##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(moments)
library(skimr)
library(RANN)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR2)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
##################################
# Loading dataset
##################################
# Load the GermanCredit data frame into the workspace via data()
# (presumably the copy shipped with the attached caret package - confirm).
data(GermanCredit)
##################################
# Performing a general exploration of the dataset
##################################
# Report the number of rows and columns.
dim(GermanCredit)
## [1] 1000 62
# Show every column's storage type together with its first few values.
str(GermanCredit)
## 'data.frame': 1000 obs. of 62 variables:
## $ Duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ Amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ InstallmentRatePercentage : int 4 2 2 2 3 2 3 2 2 4 ...
## $ ResidenceDuration : int 4 2 3 4 4 4 4 2 4 2 ...
## $ Age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ NumberExistingCredits : int 2 1 1 1 2 1 1 1 1 2 ...
## $ NumberPeopleMaintenance : int 1 1 2 2 2 2 1 1 1 1 ...
## $ Telephone : num 0 1 1 1 1 0 1 0 1 1 ...
## $ ForeignWorker : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Class : Factor w/ 2 levels "Bad","Good": 2 1 2 2 1 2 2 2 2 1 ...
## $ CheckingAccountStatus.lt.0 : num 1 0 0 1 1 0 0 0 0 0 ...
## $ CheckingAccountStatus.0.to.200 : num 0 1 0 0 0 0 0 1 0 1 ...
## $ CheckingAccountStatus.gt.200 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CheckingAccountStatus.none : num 0 0 1 0 0 1 1 0 1 0 ...
## $ CreditHistory.NoCredit.AllPaid : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CreditHistory.ThisBank.AllPaid : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CreditHistory.PaidDuly : num 0 1 0 1 0 1 1 1 1 0 ...
## $ CreditHistory.Delay : num 0 0 0 0 1 0 0 0 0 0 ...
## $ CreditHistory.Critical : num 1 0 1 0 0 0 0 0 0 1 ...
## $ Purpose.NewCar : num 0 0 0 0 1 0 0 0 0 1 ...
## $ Purpose.UsedCar : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Purpose.Furniture.Equipment : num 0 0 0 1 0 0 1 0 0 0 ...
## $ Purpose.Radio.Television : num 1 1 0 0 0 0 0 0 1 0 ...
## $ Purpose.DomesticAppliance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Repairs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Education : num 0 0 1 0 0 1 0 0 0 0 ...
## $ Purpose.Vacation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Retraining : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Business : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SavingsAccountBonds.lt.100 : num 0 1 1 1 1 0 0 1 0 1 ...
## $ SavingsAccountBonds.100.to.500 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SavingsAccountBonds.500.to.1000 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ SavingsAccountBonds.gt.1000 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ SavingsAccountBonds.Unknown : num 1 0 0 0 0 1 0 0 0 0 ...
## $ EmploymentDuration.lt.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ EmploymentDuration.1.to.4 : num 0 1 0 0 1 1 0 1 0 0 ...
## $ EmploymentDuration.4.to.7 : num 0 0 1 1 0 0 0 0 1 0 ...
## $ EmploymentDuration.gt.7 : num 1 0 0 0 0 0 1 0 0 0 ...
## $ EmploymentDuration.Unemployed : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Personal.Male.Divorced.Seperated : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Personal.Female.NotSingle : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Personal.Male.Single : num 1 0 1 1 1 1 1 1 0 0 ...
## $ Personal.Male.Married.Widowed : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Personal.Female.Single : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherDebtorsGuarantors.None : num 1 1 1 0 1 1 1 1 1 1 ...
## $ OtherDebtorsGuarantors.CoApplicant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherDebtorsGuarantors.Guarantor : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Property.RealEstate : num 1 1 1 0 0 0 0 0 1 0 ...
## $ Property.Insurance : num 0 0 0 1 0 0 1 0 0 0 ...
## $ Property.CarOther : num 0 0 0 0 0 0 0 1 0 1 ...
## $ Property.Unknown : num 0 0 0 0 1 1 0 0 0 0 ...
## $ OtherInstallmentPlans.Bank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherInstallmentPlans.Stores : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherInstallmentPlans.None : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Housing.Rent : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Housing.Own : num 1 1 1 0 0 0 1 0 1 1 ...
## $ Housing.ForFree : num 0 0 0 1 1 1 0 0 0 0 ...
## $ Job.UnemployedUnskilled : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Job.UnskilledResident : num 0 0 1 0 0 1 0 0 1 0 ...
## $ Job.SkilledEmployee : num 1 1 0 1 1 0 1 0 0 0 ...
## $ Job.Management.SelfEmp.HighlyQualified: num 0 0 0 0 0 0 0 1 0 1 ...
# Print per-column descriptive statistics: min/quartiles/mean/max for the
# numeric columns and level counts for the Class factor.
summary(GermanCredit)
## Duration Amount InstallmentRatePercentage ResidenceDuration
## Min. : 4.0 Min. : 250 Min. :1.000 Min. :1.000
## 1st Qu.:12.0 1st Qu.: 1366 1st Qu.:2.000 1st Qu.:2.000
## Median :18.0 Median : 2320 Median :3.000 Median :3.000
## Mean :20.9 Mean : 3271 Mean :2.973 Mean :2.845
## 3rd Qu.:24.0 3rd Qu.: 3972 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :72.0 Max. :18424 Max. :4.000 Max. :4.000
## Age NumberExistingCredits NumberPeopleMaintenance Telephone
## Min. :19.00 Min. :1.000 Min. :1.000 Min. :0.000
## 1st Qu.:27.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.000
## Median :33.00 Median :1.000 Median :1.000 Median :1.000
## Mean :35.55 Mean :1.407 Mean :1.155 Mean :0.596
## 3rd Qu.:42.00 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :75.00 Max. :4.000 Max. :2.000 Max. :1.000
## ForeignWorker Class CheckingAccountStatus.lt.0
## Min. :0.000 Bad :300 Min. :0.000
## 1st Qu.:1.000 Good:700 1st Qu.:0.000
## Median :1.000 Median :0.000
## Mean :0.963 Mean :0.274
## 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1.000 Max. :1.000
## CheckingAccountStatus.0.to.200 CheckingAccountStatus.gt.200
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.269 Mean :0.063
## 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## CheckingAccountStatus.none CreditHistory.NoCredit.AllPaid
## Min. :0.000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.00
## Median :0.000 Median :0.00
## Mean :0.394 Mean :0.04
## 3rd Qu.:1.000 3rd Qu.:0.00
## Max. :1.000 Max. :1.00
## CreditHistory.ThisBank.AllPaid CreditHistory.PaidDuly CreditHistory.Delay
## Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :1.00 Median :0.000
## Mean :0.049 Mean :0.53 Mean :0.088
## 3rd Qu.:0.000 3rd Qu.:1.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.00 Max. :1.000
## CreditHistory.Critical Purpose.NewCar Purpose.UsedCar
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.293 Mean :0.234 Mean :0.103
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
## Purpose.Furniture.Equipment Purpose.Radio.Television Purpose.DomesticAppliance
## Min. :0.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.00 Median :0.000
## Mean :0.181 Mean :0.28 Mean :0.012
## 3rd Qu.:0.000 3rd Qu.:1.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.00 Max. :1.000
## Purpose.Repairs Purpose.Education Purpose.Vacation Purpose.Retraining
## Min. :0.000 Min. :0.00 Min. :0 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0 1st Qu.:0.000
## Median :0.000 Median :0.00 Median :0 Median :0.000
## Mean :0.022 Mean :0.05 Mean :0 Mean :0.009
## 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0 3rd Qu.:0.000
## Max. :1.000 Max. :1.00 Max. :0 Max. :1.000
## Purpose.Business Purpose.Other SavingsAccountBonds.lt.100
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :1.000
## Mean :0.097 Mean :0.012 Mean :0.603
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000
## Max. :1.000 Max. :1.000 Max. :1.000
## SavingsAccountBonds.100.to.500 SavingsAccountBonds.500.to.1000
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.103 Mean :0.063
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## SavingsAccountBonds.gt.1000 SavingsAccountBonds.Unknown
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.048 Mean :0.183
## 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## EmploymentDuration.lt.1 EmploymentDuration.1.to.4 EmploymentDuration.4.to.7
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.172 Mean :0.339 Mean :0.174
## 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
## EmploymentDuration.gt.7 EmploymentDuration.Unemployed
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000
## Mean :0.253 Mean :0.062
## 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## Personal.Male.Divorced.Seperated Personal.Female.NotSingle
## Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.00
## Median :0.00 Median :0.00
## Mean :0.05 Mean :0.31
## 3rd Qu.:0.00 3rd Qu.:1.00
## Max. :1.00 Max. :1.00
## Personal.Male.Single Personal.Male.Married.Widowed Personal.Female.Single
## Min. :0.000 Min. :0.000 Min. :0
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0
## Median :1.000 Median :0.000 Median :0
## Mean :0.548 Mean :0.092 Mean :0
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0
## Max. :1.000 Max. :1.000 Max. :0
## OtherDebtorsGuarantors.None OtherDebtorsGuarantors.CoApplicant
## Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.000
## Median :1.000 Median :0.000
## Mean :0.907 Mean :0.041
## 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000
## OtherDebtorsGuarantors.Guarantor Property.RealEstate Property.Insurance
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.052 Mean :0.282 Mean :0.232
## 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
## Property.CarOther Property.Unknown OtherInstallmentPlans.Bank
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.332 Mean :0.154 Mean :0.139
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
## OtherInstallmentPlans.Stores OtherInstallmentPlans.None Housing.Rent
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.000
## Median :0.000 Median :1.000 Median :0.000
## Mean :0.047 Mean :0.814 Mean :0.179
## 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
## Housing.Own Housing.ForFree Job.UnemployedUnskilled Job.UnskilledResident
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.0
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0
## Median :1.000 Median :0.000 Median :0.000 Median :0.0
## Mean :0.713 Mean :0.108 Mean :0.022 Mean :0.2
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.0
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.0
## Job.SkilledEmployee Job.Management.SelfEmp.HighlyQualified
## Min. :0.00 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.000
## Median :1.00 Median :0.000
## Mean :0.63 Mean :0.148
## 3rd Qu.:1.00 3rd Qu.:0.000
## Max. :1.00 Max. :1.000
##################################
# Formulating a data type assessment summary
##################################
# PDA is the working copy of the dataset used for the data type assessment.
PDA <- GermanCredit

# One row per column of PDA: its position, its name and its class.
# The outer parentheses make the assignment print the resulting data frame.
(PDA.Summary <- data.frame(
  Column.Index = seq_along(names(PDA)),
  Column.Name = names(PDA),
  # class(x)[1] guarantees exactly one label per column, even for columns
  # carrying several classes (e.g. ordered factors).
  Column.Type = vapply(PDA, function(x) class(x)[1], character(1)),
  row.names = NULL
))
## Column.Index Column.Name Column.Type
## 1 1 Duration integer
## 2 2 Amount integer
## 3 3 InstallmentRatePercentage integer
## 4 4 ResidenceDuration integer
## 5 5 Age integer
## 6 6 NumberExistingCredits integer
## 7 7 NumberPeopleMaintenance integer
## 8 8 Telephone numeric
## 9 9 ForeignWorker numeric
## 10 10 Class factor
## 11 11 CheckingAccountStatus.lt.0 numeric
## 12 12 CheckingAccountStatus.0.to.200 numeric
## 13 13 CheckingAccountStatus.gt.200 numeric
## 14 14 CheckingAccountStatus.none numeric
## 15 15 CreditHistory.NoCredit.AllPaid numeric
## 16 16 CreditHistory.ThisBank.AllPaid numeric
## 17 17 CreditHistory.PaidDuly numeric
## 18 18 CreditHistory.Delay numeric
## 19 19 CreditHistory.Critical numeric
## 20 20 Purpose.NewCar numeric
## 21 21 Purpose.UsedCar numeric
## 22 22 Purpose.Furniture.Equipment numeric
## 23 23 Purpose.Radio.Television numeric
## 24 24 Purpose.DomesticAppliance numeric
## 25 25 Purpose.Repairs numeric
## 26 26 Purpose.Education numeric
## 27 27 Purpose.Vacation numeric
## 28 28 Purpose.Retraining numeric
## 29 29 Purpose.Business numeric
## 30 30 Purpose.Other numeric
## 31 31 SavingsAccountBonds.lt.100 numeric
## 32 32 SavingsAccountBonds.100.to.500 numeric
## 33 33 SavingsAccountBonds.500.to.1000 numeric
## 34 34 SavingsAccountBonds.gt.1000 numeric
## 35 35 SavingsAccountBonds.Unknown numeric
## 36 36 EmploymentDuration.lt.1 numeric
## 37 37 EmploymentDuration.1.to.4 numeric
## 38 38 EmploymentDuration.4.to.7 numeric
## 39 39 EmploymentDuration.gt.7 numeric
## 40 40 EmploymentDuration.Unemployed numeric
## 41 41 Personal.Male.Divorced.Seperated numeric
## 42 42 Personal.Female.NotSingle numeric
## 43 43 Personal.Male.Single numeric
## 44 44 Personal.Male.Married.Widowed numeric
## 45 45 Personal.Female.Single numeric
## 46 46 OtherDebtorsGuarantors.None numeric
## 47 47 OtherDebtorsGuarantors.CoApplicant numeric
## 48 48 OtherDebtorsGuarantors.Guarantor numeric
## 49 49 Property.RealEstate numeric
## 50 50 Property.Insurance numeric
## 51 51 Property.CarOther numeric
## 52 52 Property.Unknown numeric
## 53 53 OtherInstallmentPlans.Bank numeric
## 54 54 OtherInstallmentPlans.Stores numeric
## 55 55 OtherInstallmentPlans.None numeric
## 56 56 Housing.Rent numeric
## 57 57 Housing.Own numeric
## 58 58 Housing.ForFree numeric
## 59 59 Job.UnemployedUnskilled numeric
## 60 60 Job.UnskilledResident numeric
## 61 61 Job.SkilledEmployee numeric
## 62 62 Job.Management.SelfEmp.HighlyQualified numeric
##################################
# Creating a working copy of the dataset
# for the data quality assessment
##################################
DQA <- GermanCredit

##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column of DQA: position, name, class, number of rows,
# number of missing values and the share of non-missing values (Fill.Rate,
# formatted as text with three decimals).
# The outer parentheses make the assignment print the resulting data frame.
(DQA.Summary <- data.frame(
  Column.Index = seq_along(names(DQA)),
  Column.Name = names(DQA),
  # class(x)[1] guarantees exactly one label per column.
  Column.Type = vapply(DQA, function(x) class(x)[1], character(1)),
  Row.Count = rep(nrow(DQA), ncol(DQA)),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL
))
## Column.Index Column.Name Column.Type Row.Count
## 1 1 Duration integer 1000
## 2 2 Amount integer 1000
## 3 3 InstallmentRatePercentage integer 1000
## 4 4 ResidenceDuration integer 1000
## 5 5 Age integer 1000
## 6 6 NumberExistingCredits integer 1000
## 7 7 NumberPeopleMaintenance integer 1000
## 8 8 Telephone numeric 1000
## 9 9 ForeignWorker numeric 1000
## 10 10 Class factor 1000
## 11 11 CheckingAccountStatus.lt.0 numeric 1000
## 12 12 CheckingAccountStatus.0.to.200 numeric 1000
## 13 13 CheckingAccountStatus.gt.200 numeric 1000
## 14 14 CheckingAccountStatus.none numeric 1000
## 15 15 CreditHistory.NoCredit.AllPaid numeric 1000
## 16 16 CreditHistory.ThisBank.AllPaid numeric 1000
## 17 17 CreditHistory.PaidDuly numeric 1000
## 18 18 CreditHistory.Delay numeric 1000
## 19 19 CreditHistory.Critical numeric 1000
## 20 20 Purpose.NewCar numeric 1000
## 21 21 Purpose.UsedCar numeric 1000
## 22 22 Purpose.Furniture.Equipment numeric 1000
## 23 23 Purpose.Radio.Television numeric 1000
## 24 24 Purpose.DomesticAppliance numeric 1000
## 25 25 Purpose.Repairs numeric 1000
## 26 26 Purpose.Education numeric 1000
## 27 27 Purpose.Vacation numeric 1000
## 28 28 Purpose.Retraining numeric 1000
## 29 29 Purpose.Business numeric 1000
## 30 30 Purpose.Other numeric 1000
## 31 31 SavingsAccountBonds.lt.100 numeric 1000
## 32 32 SavingsAccountBonds.100.to.500 numeric 1000
## 33 33 SavingsAccountBonds.500.to.1000 numeric 1000
## 34 34 SavingsAccountBonds.gt.1000 numeric 1000
## 35 35 SavingsAccountBonds.Unknown numeric 1000
## 36 36 EmploymentDuration.lt.1 numeric 1000
## 37 37 EmploymentDuration.1.to.4 numeric 1000
## 38 38 EmploymentDuration.4.to.7 numeric 1000
## 39 39 EmploymentDuration.gt.7 numeric 1000
## 40 40 EmploymentDuration.Unemployed numeric 1000
## 41 41 Personal.Male.Divorced.Seperated numeric 1000
## 42 42 Personal.Female.NotSingle numeric 1000
## 43 43 Personal.Male.Single numeric 1000
## 44 44 Personal.Male.Married.Widowed numeric 1000
## 45 45 Personal.Female.Single numeric 1000
## 46 46 OtherDebtorsGuarantors.None numeric 1000
## 47 47 OtherDebtorsGuarantors.CoApplicant numeric 1000
## 48 48 OtherDebtorsGuarantors.Guarantor numeric 1000
## 49 49 Property.RealEstate numeric 1000
## 50 50 Property.Insurance numeric 1000
## 51 51 Property.CarOther numeric 1000
## 52 52 Property.Unknown numeric 1000
## 53 53 OtherInstallmentPlans.Bank numeric 1000
## 54 54 OtherInstallmentPlans.Stores numeric 1000
## 55 55 OtherInstallmentPlans.None numeric 1000
## 56 56 Housing.Rent numeric 1000
## 57 57 Housing.Own numeric 1000
## 58 58 Housing.ForFree numeric 1000
## 59 59 Job.UnemployedUnskilled numeric 1000
## 60 60 Job.UnskilledResident numeric 1000
## 61 61 Job.SkilledEmployee numeric 1000
## 62 62 Job.Management.SelfEmp.HighlyQualified numeric 1000
## NA.Count Fill.Rate
## 1 0 1.000
## 2 0 1.000
## 3 0 1.000
## 4 0 1.000
## 5 0 1.000
## 6 0 1.000
## 7 0 1.000
## 8 0 1.000
## 9 0 1.000
## 10 0 1.000
## 11 0 1.000
## 12 0 1.000
## 13 0 1.000
## 14 0 1.000
## 15 0 1.000
## 16 0 1.000
## 17 0 1.000
## 18 0 1.000
## 19 0 1.000
## 20 0 1.000
## 21 0 1.000
## 22 0 1.000
## 23 0 1.000
## 24 0 1.000
## 25 0 1.000
## 26 0 1.000
## 27 0 1.000
## 28 0 1.000
## 29 0 1.000
## 30 0 1.000
## 31 0 1.000
## 32 0 1.000
## 33 0 1.000
## 34 0 1.000
## 35 0 1.000
## 36 0 1.000
## 37 0 1.000
## 38 0 1.000
## 39 0 1.000
## 40 0 1.000
## 41 0 1.000
## 42 0 1.000
## 43 0 1.000
## 44 0 1.000
## 45 0 1.000
## 46 0 1.000
## 47 0 1.000
## 48 0 1.000
## 49 0 1.000
## 50 0 1.000
## 51 0 1.000
## 52 0 1.000
## 53 0 1.000
## 54 0 1.000
## 55 0 1.000
## 56 0 1.000
## 57 0 1.000
## 58 0 1.000
## 59 0 1.000
## 60 0 1.000
## 61 0 1.000
## 62 0 1.000
##################################
# Listing all predictors
##################################
# Every column except the response "Class" is a predictor.
# drop = FALSE keeps a data frame even if only one predictor remains.
DQA.Predictors <- DQA[, !names(DQA) %in% c("Class"), drop = FALSE]
##################################
# Identifying and converting numeric predictors
# which should have been factor predictors
##################################
# Numeric predictors only; drop = FALSE keeps a data frame for any column count.
DQA.Predictors.Numeric <- DQA.Predictors[
  , vapply(DQA.Predictors, is.numeric, logical(1)), drop = FALSE
]

# Column-wise maximum of every numeric predictor (unnamed, one value per
# column; an empty vector when there are no numeric predictors).
DQA.Predictors.Numeric.Max <- vapply(
  DQA.Predictors.Numeric, max, numeric(1), USE.NAMES = FALSE
)

# Lookup table pairing each numeric predictor with its maximum.
DQA.Predictors.Numeric.Max.Summary <- data.frame(
  Numeric.Predictors = names(DQA.Predictors.Numeric),
  Max = DQA.Predictors.Numeric.Max,
  stringsAsFactors = FALSE
)

# Predictors whose maximum is below 2 are selected for conversion
# (presumably the 0/1 indicator columns - confirm against the data).
# which() skips predictors whose maximum is NA instead of yielding NA rows.
DQA.Predictors.Numeric.To.Factor <- DQA.Predictors.Numeric.Max.Summary[
  which(DQA.Predictors.Numeric.Max.Summary$Max < 2), , drop = FALSE
]
DQA.Predictors.Numeric.To.Factor.Names <- as.vector(
  DQA.Predictors.Numeric.To.Factor$Numeric.Predictors
)

# Convert the selected predictors to factors in place.
if (length(DQA.Predictors.Numeric.To.Factor.Names) > 0) {
  DQA.Predictors[DQA.Predictors.Numeric.To.Factor.Names] <- lapply(
    DQA.Predictors[DQA.Predictors.Numeric.To.Factor.Names], factor
  )
}
##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when exactly one numeric predictor
# exists; otherwise the result collapses to a vector without names and the
# count below would wrongly be zero.
DQA.Predictors.Numeric <- DQA.Predictors[
  , vapply(DQA.Predictors, is.numeric, logical(1)), drop = FALSE
]

if (ncol(DQA.Predictors.Numeric) > 0) {
  print(paste0("There are ",
               ncol(DQA.Predictors.Numeric),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 7 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# drop = FALSE keeps a data frame even when exactly one factor predictor
# exists; otherwise the result collapses to a vector without names and the
# count below would wrongly be zero.
DQA.Predictors.Factor <- DQA.Predictors[
  , vapply(DQA.Predictors, is.factor, logical(1)), drop = FALSE
]

if (ncol(DQA.Predictors.Factor) > 0) {
  print(paste0("There are ",
               ncol(DQA.Predictors.Factor),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are 54 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns the most frequent non-missing value(s) of x.
  # All tied values are returned; callers take element [1].
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once every first
  # mode has been removed. When nothing is left (e.g. a single-level
  # factor) the sentinel string "x" is returned, which matches no observed
  # value and therefore yields a second-mode count of 0.
  SecondModes <- function(x) {
    observed <- na.omit(x)
    remaining <- observed[!(observed %in% FirstModes(x))]
    if (length(remaining) == 0) {
      return("x")
    }
    usm <- unique(remaining)
    tabsm <- tabulate(match(remaining, usm))
    usm[tabsm == max(tabsm)]
  }

  # One row per factor predictor: class, number of distinct values, the two
  # most frequent values with their counts, and two ratios used to flag low
  # variance. Ratios are formatted as text with three decimals.
  # The outer parentheses make the assignment print the resulting data frame.
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = vapply(
      DQA.Predictors.Factor,
      function(x) class(x)[1],
      character(1)
    ),
    Unique.Count = vapply(
      DQA.Predictors.Factor,
      function(x) length(unique(x)),
      integer(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(FirstModes(x)[1]),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(SecondModes(x)[1]),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(round(length(unique(x)) / nrow(DQA.Predictors.Factor), 3),
               nsmall = 3)
      },
      character(1)
    ),
    # A second-mode count of 0 gives a ratio of Inf.
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(round(sum(na.omit(x) == FirstModes(x)[1]) /
                       sum(na.omit(x) == SecondModes(x)[1]), 3),
               nsmall = 3)
      },
      character(1)
    ),
    row.names = NULL
  ))

}
## Column.Name Column.Type Unique.Count
## 1 Telephone factor 2
## 2 ForeignWorker factor 2
## 3 CheckingAccountStatus.lt.0 factor 2
## 4 CheckingAccountStatus.0.to.200 factor 2
## 5 CheckingAccountStatus.gt.200 factor 2
## 6 CheckingAccountStatus.none factor 2
## 7 CreditHistory.NoCredit.AllPaid factor 2
## 8 CreditHistory.ThisBank.AllPaid factor 2
## 9 CreditHistory.PaidDuly factor 2
## 10 CreditHistory.Delay factor 2
## 11 CreditHistory.Critical factor 2
## 12 Purpose.NewCar factor 2
## 13 Purpose.UsedCar factor 2
## 14 Purpose.Furniture.Equipment factor 2
## 15 Purpose.Radio.Television factor 2
## 16 Purpose.DomesticAppliance factor 2
## 17 Purpose.Repairs factor 2
## 18 Purpose.Education factor 2
## 19 Purpose.Vacation factor 1
## 20 Purpose.Retraining factor 2
## 21 Purpose.Business factor 2
## 22 Purpose.Other factor 2
## 23 SavingsAccountBonds.lt.100 factor 2
## 24 SavingsAccountBonds.100.to.500 factor 2
## 25 SavingsAccountBonds.500.to.1000 factor 2
## 26 SavingsAccountBonds.gt.1000 factor 2
## 27 SavingsAccountBonds.Unknown factor 2
## 28 EmploymentDuration.lt.1 factor 2
## 29 EmploymentDuration.1.to.4 factor 2
## 30 EmploymentDuration.4.to.7 factor 2
## 31 EmploymentDuration.gt.7 factor 2
## 32 EmploymentDuration.Unemployed factor 2
## 33 Personal.Male.Divorced.Seperated factor 2
## 34 Personal.Female.NotSingle factor 2
## 35 Personal.Male.Single factor 2
## 36 Personal.Male.Married.Widowed factor 2
## 37 Personal.Female.Single factor 1
## 38 OtherDebtorsGuarantors.None factor 2
## 39 OtherDebtorsGuarantors.CoApplicant factor 2
## 40 OtherDebtorsGuarantors.Guarantor factor 2
## 41 Property.RealEstate factor 2
## 42 Property.Insurance factor 2
## 43 Property.CarOther factor 2
## 44 Property.Unknown factor 2
## 45 OtherInstallmentPlans.Bank factor 2
## 46 OtherInstallmentPlans.Stores factor 2
## 47 OtherInstallmentPlans.None factor 2
## 48 Housing.Rent factor 2
## 49 Housing.Own factor 2
## 50 Housing.ForFree factor 2
## 51 Job.UnemployedUnskilled factor 2
## 52 Job.UnskilledResident factor 2
## 53 Job.SkilledEmployee factor 2
## 54 Job.Management.SelfEmp.HighlyQualified factor 2
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 1 0 596 404
## 2 1 0 963 37
## 3 0 1 726 274
## 4 0 1 731 269
## 5 0 1 937 63
## 6 0 1 606 394
## 7 0 1 960 40
## 8 0 1 951 49
## 9 1 0 530 470
## 10 0 1 912 88
## 11 0 1 707 293
## 12 0 1 766 234
## 13 0 1 897 103
## 14 0 1 819 181
## 15 0 1 720 280
## 16 0 1 988 12
## 17 0 1 978 22
## 18 0 1 950 50
## 19 0 x 1000 0
## 20 0 1 991 9
## 21 0 1 903 97
## 22 0 1 988 12
## 23 1 0 603 397
## 24 0 1 897 103
## 25 0 1 937 63
## 26 0 1 952 48
## 27 0 1 817 183
## 28 0 1 828 172
## 29 0 1 661 339
## 30 0 1 826 174
## 31 0 1 747 253
## 32 0 1 938 62
## 33 0 1 950 50
## 34 0 1 690 310
## 35 1 0 548 452
## 36 0 1 908 92
## 37 0 x 1000 0
## 38 1 0 907 93
## 39 0 1 959 41
## 40 0 1 948 52
## 41 0 1 718 282
## 42 0 1 768 232
## 43 0 1 668 332
## 44 0 1 846 154
## 45 0 1 861 139
## 46 0 1 953 47
## 47 1 0 814 186
## 48 0 1 821 179
## 49 1 0 713 287
## 50 0 1 892 108
## 51 0 1 978 22
## 52 0 1 800 200
## 53 1 0 630 370
## 54 0 1 852 148
## Unique.Count.Ratio First.Second.Mode.Ratio
## 1 0.002 1.475
## 2 0.002 26.027
## 3 0.002 2.650
## 4 0.002 2.717
## 5 0.002 14.873
## 6 0.002 1.538
## 7 0.002 24.000
## 8 0.002 19.408
## 9 0.002 1.128
## 10 0.002 10.364
## 11 0.002 2.413
## 12 0.002 3.274
## 13 0.002 8.709
## 14 0.002 4.525
## 15 0.002 2.571
## 16 0.002 82.333
## 17 0.002 44.455
## 18 0.002 19.000
## 19 0.001 Inf
## 20 0.002 110.111
## 21 0.002 9.309
## 22 0.002 82.333
## 23 0.002 1.519
## 24 0.002 8.709
## 25 0.002 14.873
## 26 0.002 19.833
## 27 0.002 4.464
## 28 0.002 4.814
## 29 0.002 1.950
## 30 0.002 4.747
## 31 0.002 2.953
## 32 0.002 15.129
## 33 0.002 19.000
## 34 0.002 2.226
## 35 0.002 1.212
## 36 0.002 9.870
## 37 0.001 Inf
## 38 0.002 9.753
## 39 0.002 23.390
## 40 0.002 18.231
## 41 0.002 2.546
## 42 0.002 3.310
## 43 0.002 2.012
## 44 0.002 5.494
## 45 0.002 6.194
## 46 0.002 20.277
## 47 0.002 4.376
## 48 0.002 4.587
## 49 0.002 2.484
## 50 0.002 8.259
## 51 0.002 44.455
## 52 0.002 4.000
## 53 0.002 1.703
## 54 0.002 5.757
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns the most frequent non-missing value(s) of x.
  # All tied values are returned; callers take element [1].
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once every first
  # mode has been removed. When nothing is left (a constant column) the
  # sentinel 0.00001 is returned; it is displayed as 0.000 after rounding
  # and is not expected to equal any observed value, so the count is 0.
  SecondModes <- function(x) {
    observed <- na.omit(x)
    remaining <- observed[!(observed %in% FirstModes(x))]
    if (length(remaining) == 0) {
      return(0.00001)
    }
    usm <- unique(remaining)
    tabsm <- tabulate(match(remaining, usm))
    usm[tabsm == max(tabsm)]
  }

  # One row per numeric predictor: class, distinct-value statistics, the two
  # most frequent values with their counts and ratio, and location / shape
  # statistics. Every statistic except the counts is stored as text with
  # three decimals. skewness() and kurtosis() come from the moments package.
  # The outer parentheses make the assignment print the resulting data frame.
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = vapply(
      DQA.Predictors.Numeric,
      function(x) class(x)[1],
      character(1)
    ),
    Unique.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) length(unique(x)),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(length(unique(x)) / nrow(DQA.Predictors.Numeric), 3),
               nsmall = 3)
      },
      character(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(FirstModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(SecondModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    # A second-mode count of 0 gives a ratio of Inf.
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(sum(na.omit(x) == FirstModes(x)[1]) /
                       sum(na.omit(x) == SecondModes(x)[1]), 3),
               nsmall = 3)
      },
      character(1)
    ),
    Minimum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(min(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Mean = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Median = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(median(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Maximum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(max(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Skewness = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(skewness(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Kurtosis = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(kurtosis(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Percentile25th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(quantile(x, probs = 0.25, na.rm = TRUE, names = FALSE), 3),
               nsmall = 3)
      },
      character(1)
    ),
    Percentile75th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(quantile(x, probs = 0.75, na.rm = TRUE, names = FALSE), 3),
               nsmall = 3)
      },
      character(1)
    ),
    row.names = NULL
  ))

}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 Duration integer 33 0.033
## 2 Amount integer 921 0.921
## 3 InstallmentRatePercentage integer 4 0.004
## 4 ResidenceDuration integer 4 0.004
## 5 Age integer 53 0.053
## 6 NumberExistingCredits integer 4 0.004
## 7 NumberPeopleMaintenance integer 2 0.002
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 24.000 12.000 184 179
## 2 1393.000 1169.000 3 2
## 3 4.000 2.000 476 231
## 4 4.000 2.000 413 308
## 5 27.000 26.000 51 50
## 6 1.000 2.000 633 333
## 7 1.000 2.000 845 155
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.028 4.000 20.903 18.000 72.000 1.093 3.909
## 2 1.500 250.000 3271.258 2319.500 18424.000 1.947 7.265
## 3 2.061 1.000 2.973 3.000 4.000 -0.531 1.790
## 4 1.341 1.000 2.845 3.000 4.000 -0.272 1.619
## 5 1.020 19.000 35.546 33.000 75.000 1.019 3.587
## 6 1.901 1.000 1.407 1.000 4.000 1.271 4.590
## 7 5.452 1.000 1.155 1.000 2.000 1.907 4.635
## Percentile25th Percentile75th
## 1 12.000 24.000
## 2 1365.500 3972.250
## 3 2.000 4.000
## 4 2.000 4.000
## 5 27.000 42.000
## 6 1.000 2.000
## 7 1.000 1.000
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report how many columns contain missing values and list them; otherwise
# state that none were found. The listing uses an explicit print() so it is
# shown regardless of how the script is run.
if (any(DQA.Summary$NA.Count > 0)) {
  print(paste0("Missing observations noted for ",
               sum(DQA.Summary$NA.Count > 0),
               " variable(s) with NA.Count>0 and Fill.Rate<1.0."))
  print(DQA.Summary[DQA.Summary$NA.Count > 0, ])
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor predictors whose first mode is more than five times as
# frequent as their second mode. First.Second.Mode.Ratio is stored as text,
# so it is converted back to numeric first ("Inf" converts to Inf).
# which() ignores ratios that are NA/NaN instead of producing NA rows.
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else if (length(which(
  as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5
)) > 0) {
  print(paste0("Low variance observed for ",
               length(which(
                 as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5
               )),
               " factor variable(s) with First.Second.Mode.Ratio>5."))
  print(DQA.Predictors.Factor.Summary[which(
    as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5
  ), ])
} else {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 29 factor variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count
## 2 ForeignWorker factor 2
## 5 CheckingAccountStatus.gt.200 factor 2
## 7 CreditHistory.NoCredit.AllPaid factor 2
## 8 CreditHistory.ThisBank.AllPaid factor 2
## 10 CreditHistory.Delay factor 2
## 13 Purpose.UsedCar factor 2
## 16 Purpose.DomesticAppliance factor 2
## 17 Purpose.Repairs factor 2
## 18 Purpose.Education factor 2
## 19 Purpose.Vacation factor 1
## 20 Purpose.Retraining factor 2
## 21 Purpose.Business factor 2
## 22 Purpose.Other factor 2
## 24 SavingsAccountBonds.100.to.500 factor 2
## 25 SavingsAccountBonds.500.to.1000 factor 2
## 26 SavingsAccountBonds.gt.1000 factor 2
## 32 EmploymentDuration.Unemployed factor 2
## 33 Personal.Male.Divorced.Seperated factor 2
## 36 Personal.Male.Married.Widowed factor 2
## 37 Personal.Female.Single factor 1
## 38 OtherDebtorsGuarantors.None factor 2
## 39 OtherDebtorsGuarantors.CoApplicant factor 2
## 40 OtherDebtorsGuarantors.Guarantor factor 2
## 44 Property.Unknown factor 2
## 45 OtherInstallmentPlans.Bank factor 2
## 46 OtherInstallmentPlans.Stores factor 2
## 50 Housing.ForFree factor 2
## 51 Job.UnemployedUnskilled factor 2
## 54 Job.Management.SelfEmp.HighlyQualified factor 2
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 2 1 0 963 37
## 5 0 1 937 63
## 7 0 1 960 40
## 8 0 1 951 49
## 10 0 1 912 88
## 13 0 1 897 103
## 16 0 1 988 12
## 17 0 1 978 22
## 18 0 1 950 50
## 19 0 x 1000 0
## 20 0 1 991 9
## 21 0 1 903 97
## 22 0 1 988 12
## 24 0 1 897 103
## 25 0 1 937 63
## 26 0 1 952 48
## 32 0 1 938 62
## 33 0 1 950 50
## 36 0 1 908 92
## 37 0 x 1000 0
## 38 1 0 907 93
## 39 0 1 959 41
## 40 0 1 948 52
## 44 0 1 846 154
## 45 0 1 861 139
## 46 0 1 953 47
## 50 0 1 892 108
## 51 0 1 978 22
## 54 0 1 852 148
## Unique.Count.Ratio First.Second.Mode.Ratio
## 2 0.002 26.027
## 5 0.002 14.873
## 7 0.002 24.000
## 8 0.002 19.408
## 10 0.002 10.364
## 13 0.002 8.709
## 16 0.002 82.333
## 17 0.002 44.455
## 18 0.002 19.000
## 19 0.001 Inf
## 20 0.002 110.111
## 21 0.002 9.309
## 22 0.002 82.333
## 24 0.002 8.709
## 25 0.002 14.873
## 26 0.002 19.833
## 32 0.002 15.129
## 33 0.002 19.000
## 36 0.002 9.870
## 37 0.001 Inf
## 38 0.002 9.753
## 39 0.002 23.390
## 40 0.002 18.231
## 44 0.002 5.494
## 45 0.002 6.194
## 46 0.002 20.277
## 50 0.002 8.259
## 51 0.002 44.455
## 54 0.002 5.757
# Flag numeric predictors whose most frequent value occurs more than 5 times as
# often as the second most frequent one. The flagged summary rows are the value
# of the `if` expression, so they are auto-printed after the message.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.ModeRatio <-
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))
  # which() drops NA ratios instead of turning them into all-NA rows.
  DQA.Numeric.HighModeRatio <-
    DQA.Predictors.Numeric.Summary[which(DQA.Numeric.ModeRatio > 5), ]
  if (nrow(DQA.Numeric.HighModeRatio) > 0) {
    print(paste0("Low variance observed for ",
                 nrow(DQA.Numeric.HighModeRatio),
                 " numeric variable(s) with First.Second.Mode.Ratio>5."))
    DQA.Numeric.HighModeRatio
  } else {
    print("No low variance numeric predictors due to high first-second mode ratio noted.")
  }
}
## [1] "Low variance observed for 1 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 7 NumberPeopleMaintenance integer 2 0.002
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 7 1.000 2.000 845 155
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 7 5.452 1.000 1.155 1.000 2.000 1.907 4.635
## Percentile25th Percentile75th
## 7 1.000 1.000
# Flag numeric predictors whose Unique.Count.Ratio is below 0.01. The flagged
# summary rows are the value of the `if` expression, so they are auto-printed
# after the message.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.UniqueRatio <-
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))
  # which() drops NA ratios instead of turning them into all-NA rows.
  DQA.Numeric.LowUniqueRatio <-
    DQA.Predictors.Numeric.Summary[which(DQA.Numeric.UniqueRatio < 0.01), ]
  if (nrow(DQA.Numeric.LowUniqueRatio) > 0) {
    print(paste0("Low variance observed for ",
                 nrow(DQA.Numeric.LowUniqueRatio),
                 " numeric variable(s) with Unique.Count.Ratio<0.01."))
    DQA.Numeric.LowUniqueRatio
  } else {
    print("No low variance numeric predictors due to low unique count ratio noted.")
  }
}
## [1] "Low variance observed for 4 numeric variable(s) with Unique.Count.Ratio<0.01."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 3 InstallmentRatePercentage integer 4 0.004
## 4 ResidenceDuration integer 4 0.004
## 6 NumberExistingCredits integer 4 0.004
## 7 NumberPeopleMaintenance integer 2 0.002
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 3 4.000 2.000 476 231
## 4 4.000 2.000 413 308
## 6 1.000 2.000 633 333
## 7 1.000 2.000 845 155
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 3 2.061 1.000 2.973 3.000 4.000 -0.531 1.790
## 4 1.341 1.000 2.845 3.000 4.000 -0.272 1.619
## 6 1.901 1.000 1.407 1.000 4.000 1.271 4.590
## 7 5.452 1.000 1.155 1.000 2.000 1.907 4.635
## Percentile25th Percentile75th
## 3 2.000 4.000
## 4 2.000 4.000
## 6 1.000 2.000
## 7 1.000 1.000
##################################
# Checking for skewed predictors
##################################
# Flag numeric predictors whose skewness lies outside [-3, 3]. The flagged
# summary rows are the value of the `if` expression, so they are auto-printed
# after the message.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.Skewness <-
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))
  # abs() > 3 is the two-sided test (Skewness > 3 or Skewness < -3);
  # which() drops NA skewness values instead of producing all-NA rows.
  DQA.Numeric.Skewed <-
    DQA.Predictors.Numeric.Summary[which(abs(DQA.Numeric.Skewness) > 3), ]
  if (nrow(DQA.Numeric.Skewed) > 0) {
    print(paste0("High skewness observed for ",
                 nrow(DQA.Numeric.Skewed),
                 " numeric variable(s) with Skewness>3 or Skewness<(-3)."))
    DQA.Numeric.Skewed
  } else {
    print("No skewed numeric predictors noted.")
  }
}
## [1] "No skewed numeric predictors noted."
##################################
# Loading dataset
##################################
# Working copy of the raw dataset for the data pre-processing analysis (DPA).
DPA <- GermanCredit

##################################
# Listing all predictors
##################################
# Reuses the predictor set already built for the data-quality assessment.
DPA.Predictors <- DQA.Predictors

##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DQA.Predictors.Numeric
##################################
# Identifying outliers for the numeric predictors
##################################
# For every numeric predictor: count the values that boxplot.stats() reports
# as outliers, draw a horizontal boxplot annotated with that count, and finally
# report how many predictors have at least one outlier.
# The result vector is preallocated and the loop uses seq_len(), so a data
# frame with zero numeric predictors is handled without error.
OutlierCountList <- numeric(ncol(DPA.Predictors.Numeric))
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[[i]])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  # Row positions of the outlying observations for the current predictor
  # (overwritten on every iteration).
  OutlierIndices <- which(DPA.Predictors.Numeric[[i]] %in% c(Outliers))
  boxplot(DPA.Predictors.Numeric[[i]],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal = TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}

# One row per numeric predictor with its outlier count, built directly as
# typed columns rather than via a character matrix.
OutlierCountSummary <- data.frame(NumericPredictors = names(DPA.Predictors.Numeric),
                                  OutlierCount = OutlierCountList,
                                  stringsAsFactors = FALSE)
NumericPredictorWithOutlierCount <- nrow(OutlierCountSummary[OutlierCountSummary$OutlierCount > 0, ])
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "5 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# Summarise the numeric predictors with skim(); the outer parentheses make the
# assignment visible so the summary is also auto-printed.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Name | DPA.Predictors.Numeric |
Number of rows | 1000 |
Number of columns | 7 |
_______________________ | |
Column type frequency: | |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.00 | 72 | ▇▇▂▁▁ |
Amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.25 | 18424 | ▇▂▁▁▁ |
InstallmentRatePercentage | 0 | 1 | 2.97 | 1.12 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▃▁▂▇ |
ResidenceDuration | 0 | 1 | 2.85 | 1.10 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▆▁▃▇ |
Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.00 | 75 | ▇▆▃▁▁ |
NumberExistingCredits | 0 | 1 | 1.41 | 0.58 | 1 | 1.0 | 1.0 | 2.00 | 4 | ▇▅▁▁▁ |
NumberPeopleMaintenance | 0 | 1 | 1.16 | 0.36 | 1 | 1.0 | 1.0 | 1.00 | 2 | ▇▁▁▁▂ |
##################################
# Reusing dataset
##################################
##################################
# Identifying and converting numeric variables
# which should have been factor variables
##################################
# Numeric columns of the dataset; drop = FALSE keeps a data frame even when
# only one column qualifies.
DQA.Numeric <- DQA[, vapply(DQA, is.numeric, logical(1)), drop = FALSE]

# Maximum of every numeric column (unnamed, so the summary below keeps default
# row names).
DQA.Numeric.Max <- vapply(DQA.Numeric, max, numeric(1), USE.NAMES = FALSE)

DQA.Numeric.Max.Summary <- data.frame(Numeric.Predictors = names(DQA.Numeric),
                                      Max = DQA.Numeric.Max,
                                      stringsAsFactors = FALSE)

# Numeric columns whose maximum is below 2 are selected for conversion to
# factor. which() skips columns whose maximum is NA.
DQA.Numeric.To.Factor <- DQA.Numeric.Max.Summary[which(DQA.Numeric.Max.Summary$Max < 2), ]
DQA.Numeric.To.Factor.Names <- as.vector(DQA.Numeric.To.Factor$Numeric.Predictors)

DQA[DQA.Numeric.To.Factor.Names] <- lapply(DQA[DQA.Numeric.To.Factor.Names], factor)

# The converted dataset becomes the working copy for pre-processing.
DPA <- DQA
# Gathering descriptive statistics
##################################
# Summarise the full dataset after the numeric-to-factor conversion; the outer
# parentheses make the assignment visible so the summary is also auto-printed.
(DPA_Skimmed <- skim(DQA))
Name | DQA |
Number of rows | 1000 |
Number of columns | 62 |
_______________________ | |
Column type frequency: | |
factor | 55 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Telephone | 0 | 1 | FALSE | 2 | 1: 596, 0: 404 |
ForeignWorker | 0 | 1 | FALSE | 2 | 1: 963, 0: 37 |
Class | 0 | 1 | FALSE | 2 | Goo: 700, Bad: 300 |
CheckingAccountStatus.lt.0 | 0 | 1 | FALSE | 2 | 0: 726, 1: 274 |
CheckingAccountStatus.0.to.200 | 0 | 1 | FALSE | 2 | 0: 731, 1: 269 |
CheckingAccountStatus.gt.200 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
CheckingAccountStatus.none | 0 | 1 | FALSE | 2 | 0: 606, 1: 394 |
CreditHistory.NoCredit.AllPaid | 0 | 1 | FALSE | 2 | 0: 960, 1: 40 |
CreditHistory.ThisBank.AllPaid | 0 | 1 | FALSE | 2 | 0: 951, 1: 49 |
CreditHistory.PaidDuly | 0 | 1 | FALSE | 2 | 1: 530, 0: 470 |
CreditHistory.Delay | 0 | 1 | FALSE | 2 | 0: 912, 1: 88 |
CreditHistory.Critical | 0 | 1 | FALSE | 2 | 0: 707, 1: 293 |
Purpose.NewCar | 0 | 1 | FALSE | 2 | 0: 766, 1: 234 |
Purpose.UsedCar | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
Purpose.Furniture.Equipment | 0 | 1 | FALSE | 2 | 0: 819, 1: 181 |
Purpose.Radio.Television | 0 | 1 | FALSE | 2 | 0: 720, 1: 280 |
Purpose.DomesticAppliance | 0 | 1 | FALSE | 2 | 0: 988, 1: 12 |
Purpose.Repairs | 0 | 1 | FALSE | 2 | 0: 978, 1: 22 |
Purpose.Education | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Purpose.Vacation | 0 | 1 | FALSE | 1 | 0: 1000 |
Purpose.Retraining | 0 | 1 | FALSE | 2 | 0: 991, 1: 9 |
Purpose.Business | 0 | 1 | FALSE | 2 | 0: 903, 1: 97 |
Purpose.Other | 0 | 1 | FALSE | 2 | 0: 988, 1: 12 |
SavingsAccountBonds.lt.100 | 0 | 1 | FALSE | 2 | 1: 603, 0: 397 |
SavingsAccountBonds.100.to.500 | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
SavingsAccountBonds.500.to.1000 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
SavingsAccountBonds.gt.1000 | 0 | 1 | FALSE | 2 | 0: 952, 1: 48 |
SavingsAccountBonds.Unknown | 0 | 1 | FALSE | 2 | 0: 817, 1: 183 |
EmploymentDuration.lt.1 | 0 | 1 | FALSE | 2 | 0: 828, 1: 172 |
EmploymentDuration.1.to.4 | 0 | 1 | FALSE | 2 | 0: 661, 1: 339 |
EmploymentDuration.4.to.7 | 0 | 1 | FALSE | 2 | 0: 826, 1: 174 |
EmploymentDuration.gt.7 | 0 | 1 | FALSE | 2 | 0: 747, 1: 253 |
EmploymentDuration.Unemployed | 0 | 1 | FALSE | 2 | 0: 938, 1: 62 |
Personal.Male.Divorced.Seperated | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Personal.Female.NotSingle | 0 | 1 | FALSE | 2 | 0: 690, 1: 310 |
Personal.Male.Single | 0 | 1 | FALSE | 2 | 1: 548, 0: 452 |
Personal.Male.Married.Widowed | 0 | 1 | FALSE | 2 | 0: 908, 1: 92 |
Personal.Female.Single | 0 | 1 | FALSE | 1 | 0: 1000 |
OtherDebtorsGuarantors.None | 0 | 1 | FALSE | 2 | 1: 907, 0: 93 |
OtherDebtorsGuarantors.CoApplicant | 0 | 1 | FALSE | 2 | 0: 959, 1: 41 |
OtherDebtorsGuarantors.Guarantor | 0 | 1 | FALSE | 2 | 0: 948, 1: 52 |
Property.RealEstate | 0 | 1 | FALSE | 2 | 0: 718, 1: 282 |
Property.Insurance | 0 | 1 | FALSE | 2 | 0: 768, 1: 232 |
Property.CarOther | 0 | 1 | FALSE | 2 | 0: 668, 1: 332 |
Property.Unknown | 0 | 1 | FALSE | 2 | 0: 846, 1: 154 |
OtherInstallmentPlans.Bank | 0 | 1 | FALSE | 2 | 0: 861, 1: 139 |
OtherInstallmentPlans.Stores | 0 | 1 | FALSE | 2 | 0: 953, 1: 47 |
OtherInstallmentPlans.None | 0 | 1 | FALSE | 2 | 1: 814, 0: 186 |
Housing.Rent | 0 | 1 | FALSE | 2 | 0: 821, 1: 179 |
Housing.Own | 0 | 1 | FALSE | 2 | 1: 713, 0: 287 |
Housing.ForFree | 0 | 1 | FALSE | 2 | 0: 892, 1: 108 |
Job.UnemployedUnskilled | 0 | 1 | FALSE | 2 | 0: 978, 1: 22 |
Job.UnskilledResident | 0 | 1 | FALSE | 2 | 0: 800, 1: 200 |
Job.SkilledEmployee | 0 | 1 | FALSE | 2 | 1: 630, 0: 370 |
Job.Management.SelfEmp.HighlyQualified | 0 | 1 | FALSE | 2 | 0: 852, 1: 148 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.00 | 72 | ▇▇▂▁▁ |
Amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.25 | 18424 | ▇▂▁▁▁ |
InstallmentRatePercentage | 0 | 1 | 2.97 | 1.12 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▃▁▂▇ |
ResidenceDuration | 0 | 1 | 2.85 | 1.10 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▆▁▃▇ |
Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.00 | 75 | ▇▆▃▁▁ |
NumberExistingCredits | 0 | 1 | 1.41 | 0.58 | 1 | 1.0 | 1.0 | 2.00 | 4 | ▇▅▁▁▁ |
NumberPeopleMaintenance | 0 | 1 | 1.16 | 0.36 | 1 | 1.0 | 1.0 | 1.00 | 2 | ▇▁▁▁▂ |
##################################
# Identifying columns with low variance
###################################
# Per-column near-zero-variance metrics from caret::nearZeroVar().
# freqCut is the cutoff on the ratio of the most common to the second most
# common value (95/5 = 19); uniqueCut is the cutoff on the percentage of
# distinct values (10).
DPA_LowVariance <- nearZeroVar(DPA,
                               freqCut = 95/5,
                               uniqueCut = 10,
                               saveMetrics = TRUE)

# Show only the columns flagged as near-zero variance.
(DPA_LowVariance[DPA_LowVariance$nzv, ])
## freqRatio percentUnique zeroVar nzv
## ForeignWorker 26.02703 0.2 FALSE TRUE
## CreditHistory.NoCredit.AllPaid 24.00000 0.2 FALSE TRUE
## CreditHistory.ThisBank.AllPaid 19.40816 0.2 FALSE TRUE
## Purpose.DomesticAppliance 82.33333 0.2 FALSE TRUE
## Purpose.Repairs 44.45455 0.2 FALSE TRUE
## Purpose.Vacation 0.00000 0.1 TRUE TRUE
## Purpose.Retraining 110.11111 0.2 FALSE TRUE
## Purpose.Other 82.33333 0.2 FALSE TRUE
## SavingsAccountBonds.gt.1000 19.83333 0.2 FALSE TRUE
## Personal.Female.Single 0.00000 0.1 TRUE TRUE
## OtherDebtorsGuarantors.CoApplicant 23.39024 0.2 FALSE TRUE
## OtherInstallmentPlans.Stores 20.27660 0.2 FALSE TRUE
## Job.UnemployedUnskilled 44.45455 0.2 FALSE TRUE
# Report the columns flagged as near-zero variance, list them one by one, and
# build the dataset without them (DPA_ExcludedLowVariance, also stored as PMA).
# NOTE(review): the first message quotes First.Second.Mode.Ratio>4 and
# Unique.Count.Ratio<0.10 and calls the variables "numeric", whereas the flags
# come from nearZeroVar(freqCut = 95/5, uniqueCut = 10) on a mostly factor
# dataset. The strings are left unchanged to match the recorded output;
# confirm the intended wording.
if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])) == 0) {
  print("No low variance predictors noted.")
} else {
  # Names of the flagged columns, computed once and reused below.
  DPA_LowVarianceNames <- rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])

  print(paste0("Low variance observed for ",
               length(DPA_LowVarianceNames),
               " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))

  DPA_LowVarianceForRemoval <- length(DPA_LowVarianceNames)

  print(paste0("Low variance can be resolved by removing ",
               length(DPA_LowVarianceNames),
               " numeric variable(s)."))

  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- DPA_LowVarianceNames[j]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_LowVarianceRemovedVariable))
  }

  # Skim summary restricted to the flagged columns. It is not the last
  # expression of the branch, so its value is not auto-printed.
  DPA %>%
    skim() %>%
    dplyr::filter(skim_variable %in% DPA_LowVarianceNames)

  ##################################
  # Filtering out columns with low variance
  #################################
  DPA_ExcludedLowVariance <- DPA[, !names(DPA) %in% DPA_LowVarianceNames]

  PMA <- DPA_ExcludedLowVariance

  ##################################
  # Gathering descriptive statistics
  ##################################
  # Last expression of the branch: the parenthesised assignment is visible, so
  # the summary of the filtered dataset is auto-printed.
  (DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
}
## [1] "Low variance observed for 13 numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."
## [1] "Low variance can be resolved by removing 13 numeric variable(s)."
## [1] "Variable 1 for removal: ForeignWorker"
## [1] "Variable 2 for removal: CreditHistory.NoCredit.AllPaid"
## [1] "Variable 3 for removal: CreditHistory.ThisBank.AllPaid"
## [1] "Variable 4 for removal: Purpose.DomesticAppliance"
## [1] "Variable 5 for removal: Purpose.Repairs"
## [1] "Variable 6 for removal: Purpose.Vacation"
## [1] "Variable 7 for removal: Purpose.Retraining"
## [1] "Variable 8 for removal: Purpose.Other"
## [1] "Variable 9 for removal: SavingsAccountBonds.gt.1000"
## [1] "Variable 10 for removal: Personal.Female.Single"
## [1] "Variable 11 for removal: OtherDebtorsGuarantors.CoApplicant"
## [1] "Variable 12 for removal: OtherInstallmentPlans.Stores"
## [1] "Variable 13 for removal: Job.UnemployedUnskilled"
Name | DPA_ExcludedLowVariance |
Number of rows | 1000 |
Number of columns | 49 |
_______________________ | |
Column type frequency: | |
factor | 42 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Telephone | 0 | 1 | FALSE | 2 | 1: 596, 0: 404 |
Class | 0 | 1 | FALSE | 2 | Goo: 700, Bad: 300 |
CheckingAccountStatus.lt.0 | 0 | 1 | FALSE | 2 | 0: 726, 1: 274 |
CheckingAccountStatus.0.to.200 | 0 | 1 | FALSE | 2 | 0: 731, 1: 269 |
CheckingAccountStatus.gt.200 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
CheckingAccountStatus.none | 0 | 1 | FALSE | 2 | 0: 606, 1: 394 |
CreditHistory.PaidDuly | 0 | 1 | FALSE | 2 | 1: 530, 0: 470 |
CreditHistory.Delay | 0 | 1 | FALSE | 2 | 0: 912, 1: 88 |
CreditHistory.Critical | 0 | 1 | FALSE | 2 | 0: 707, 1: 293 |
Purpose.NewCar | 0 | 1 | FALSE | 2 | 0: 766, 1: 234 |
Purpose.UsedCar | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
Purpose.Furniture.Equipment | 0 | 1 | FALSE | 2 | 0: 819, 1: 181 |
Purpose.Radio.Television | 0 | 1 | FALSE | 2 | 0: 720, 1: 280 |
Purpose.Education | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Purpose.Business | 0 | 1 | FALSE | 2 | 0: 903, 1: 97 |
SavingsAccountBonds.lt.100 | 0 | 1 | FALSE | 2 | 1: 603, 0: 397 |
SavingsAccountBonds.100.to.500 | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
SavingsAccountBonds.500.to.1000 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
SavingsAccountBonds.Unknown | 0 | 1 | FALSE | 2 | 0: 817, 1: 183 |
EmploymentDuration.lt.1 | 0 | 1 | FALSE | 2 | 0: 828, 1: 172 |
EmploymentDuration.1.to.4 | 0 | 1 | FALSE | 2 | 0: 661, 1: 339 |
EmploymentDuration.4.to.7 | 0 | 1 | FALSE | 2 | 0: 826, 1: 174 |
EmploymentDuration.gt.7 | 0 | 1 | FALSE | 2 | 0: 747, 1: 253 |
EmploymentDuration.Unemployed | 0 | 1 | FALSE | 2 | 0: 938, 1: 62 |
Personal.Male.Divorced.Seperated | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Personal.Female.NotSingle | 0 | 1 | FALSE | 2 | 0: 690, 1: 310 |
Personal.Male.Single | 0 | 1 | FALSE | 2 | 1: 548, 0: 452 |
Personal.Male.Married.Widowed | 0 | 1 | FALSE | 2 | 0: 908, 1: 92 |
OtherDebtorsGuarantors.None | 0 | 1 | FALSE | 2 | 1: 907, 0: 93 |
OtherDebtorsGuarantors.Guarantor | 0 | 1 | FALSE | 2 | 0: 948, 1: 52 |
Property.RealEstate | 0 | 1 | FALSE | 2 | 0: 718, 1: 282 |
Property.Insurance | 0 | 1 | FALSE | 2 | 0: 768, 1: 232 |
Property.CarOther | 0 | 1 | FALSE | 2 | 0: 668, 1: 332 |
Property.Unknown | 0 | 1 | FALSE | 2 | 0: 846, 1: 154 |
OtherInstallmentPlans.Bank | 0 | 1 | FALSE | 2 | 0: 861, 1: 139 |
OtherInstallmentPlans.None | 0 | 1 | FALSE | 2 | 1: 814, 0: 186 |
Housing.Rent | 0 | 1 | FALSE | 2 | 0: 821, 1: 179 |
Housing.Own | 0 | 1 | FALSE | 2 | 1: 713, 0: 287 |
Housing.ForFree | 0 | 1 | FALSE | 2 | 0: 892, 1: 108 |
Job.UnskilledResident | 0 | 1 | FALSE | 2 | 0: 800, 1: 200 |
Job.SkilledEmployee | 0 | 1 | FALSE | 2 | 1: 630, 0: 370 |
Job.Management.SelfEmp.HighlyQualified | 0 | 1 | FALSE | 2 | 0: 852, 1: 148 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.00 | 72 | ▇▇▂▁▁ |
Amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.25 | 18424 | ▇▂▁▁▁ |
InstallmentRatePercentage | 0 | 1 | 2.97 | 1.12 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▃▁▂▇ |
ResidenceDuration | 0 | 1 | 2.85 | 1.10 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▆▁▃▇ |
Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.00 | 75 | ▇▆▃▁▁ |
NumberExistingCredits | 0 | 1 | 1.41 | 0.58 | 1 | 1.0 | 1.0 | 2.00 | 4 | ▇▅▁▁▁ |
NumberPeopleMaintenance | 0 | 1 | 1.16 | 0.36 | 1 | 1.0 | 1.0 | 1.00 | 2 | ▇▁▁▁▂ |
##################################
# Reusing dataset
##################################
# Numeric columns that remain after the low-variance filter; drop = FALSE keeps
# a data frame even when a single column qualifies.
DQA.Numeric_ExcludedLowVariance <-
  DPA_ExcludedLowVariance[, vapply(DPA_ExcludedLowVariance, is.numeric, logical(1)),
                          drop = FALSE]

# These become the numeric predictors used by the correlation analysis.
DPA.Predictors.Numeric <- DQA.Numeric_ExcludedLowVariance
##################################
# Visualizing pairwise correlation between predictors
##################################
# Pairwise Pearson correlation tests; the resulting p-value matrix is used
# below to blank out non-significant cells of the plot.
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
                                 method = "pearson",
                                 conf.level = .95)

# Upper-triangle circle plot of the Pearson correlations, with cells whose
# p-value exceeds 0.05 left blank.
corrplot(cor(DPA.Predictors.Numeric,
             method = "pearson",
             use = "pairwise.complete.obs"),
         method = "circle",
         type = "upper",
         order = "original",
         tl.col = "black",
         tl.cex = 0.75,
         tl.srt = 90,
         sig.level = 0.05,
         p.mat = DPA_CorrelationTest$p,
         insig = "blank")
##################################
# Identifying the highly correlated variables
##################################
# Pearson correlation matrix of the numeric predictors.
DPA_Correlation <- cor(DPA.Predictors.Numeric,
                       method = "pearson",
                       use = "pairwise.complete.obs")

# Number of distinct predictor pairs (upper triangle only) whose absolute
# correlation exceeds 0.95; parenthesised so the count is also printed.
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
# Report the number of highly correlated predictor pairs and, when there are
# any, compute the top pairs with corr_cross(). That call is the last
# expression of the branch and is parenthesised, so its result is auto-printed.
if (DPA_HighlyCorrelatedCount == 0) {
  print("No highly correlated predictors noted.")
} else {
  print(paste0("High correlation observed for ",
               (DPA_HighlyCorrelatedCount), " pairs of numeric variable(s) with Correlation.Coefficient>0.95."))

  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           max_pvalue = 0.05,
                                           top = DPA_HighlyCorrelatedCount,
                                           rm.na = TRUE,
                                           grid = FALSE
  ))
}
## [1] "No highly correlated predictors noted."
# When highly correlated pairs exist, ask caret::findCorrelation() which
# columns to drop, list them, and build the dataset without them.
if (DPA_HighlyCorrelatedCount > 0) {
  # Column positions within DPA_Correlation (i.e. within
  # DPA.Predictors.Numeric), not within DPA.
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)

  (DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))

  print(paste0("High correlation can be resolved by removing ",
               (DPA_HighlyCorrelatedForRemoval), " numeric variable(s)."))

  for (j in seq_len(DPA_HighlyCorrelatedForRemoval)) {
    DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }

  ##################################
  # Filtering out columns with high correlation
  #################################
  # Remove by name: the positions above index the numeric-only correlation
  # matrix, so using them directly on DPA would drop the wrong columns whenever
  # DPA's column order differs from that of the numeric subset.
  DPA_HighlyCorrelatedNames <- colnames(DPA_Correlation)[DPA_HighlyCorrelated]
  DPA_ExcludedHighCorrelation <- DPA[, !names(DPA) %in% DPA_HighlyCorrelatedNames, drop = FALSE]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
}
##################################
# Loading dataset
##################################
# Start again from the raw dataset and drop the columns previously flagged as
# near-zero variance in DPA_LowVariance.
DPA <- GermanCredit
DPA_ExcludedLowVariance <- DPA[, !names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])]

##################################
# Listing all predictors
##################################
# Every remaining column except the response variable Class.
DPA.Predictors <- DPA_ExcludedLowVariance[, !names(DPA_ExcludedLowVariance) %in% c("Class")]

##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when a single column qualifies.
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)),
                                         drop = FALSE]
##################################
# Finding linear dependencies
##################################
# Search the numeric predictors for linear combinations. The result exposes
# $linearCombos (one index vector per dependent subset) and $remove (column
# positions suggested for removal), both used further below.
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

##################################
# Identifying the linearly dependent variables
##################################
# The result computed above is reused rather than running the search again.
# Parenthesised so the number of dependent subsets is also printed.
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 4
# Report the number of linearly dependent subsets and list the member columns
# of each one. paste0() is vectorised over the member names, so each subset
# prints as a character vector with one element per column.
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               (DPA_LinearlyDependentCount), " subset(s) of numeric variable(s)."))

  for (i in seq_len(DPA_LinearlyDependentCount)) {
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    print(paste0("Linear dependent variable(s) for subset ",
                 i, " include: ",
                 DPA_LinearlyDependentSubset))
  }
}
## [1] "Linear dependency observed for 4 subset(s) of numeric variable(s)."
## [1] "Linear dependent variable(s) for subset 1 include: EmploymentDuration.Unemployed"
## [2] "Linear dependent variable(s) for subset 1 include: CheckingAccountStatus.lt.0"
## [3] "Linear dependent variable(s) for subset 1 include: CheckingAccountStatus.0.to.200"
## [4] "Linear dependent variable(s) for subset 1 include: CheckingAccountStatus.gt.200"
## [5] "Linear dependent variable(s) for subset 1 include: CheckingAccountStatus.none"
## [6] "Linear dependent variable(s) for subset 1 include: EmploymentDuration.lt.1"
## [7] "Linear dependent variable(s) for subset 1 include: EmploymentDuration.1.to.4"
## [8] "Linear dependent variable(s) for subset 1 include: EmploymentDuration.4.to.7"
## [9] "Linear dependent variable(s) for subset 1 include: EmploymentDuration.gt.7"
## [1] "Linear dependent variable(s) for subset 2 include: Personal.Male.Married.Widowed"
## [2] "Linear dependent variable(s) for subset 2 include: CheckingAccountStatus.lt.0"
## [3] "Linear dependent variable(s) for subset 2 include: CheckingAccountStatus.0.to.200"
## [4] "Linear dependent variable(s) for subset 2 include: CheckingAccountStatus.gt.200"
## [5] "Linear dependent variable(s) for subset 2 include: CheckingAccountStatus.none"
## [6] "Linear dependent variable(s) for subset 2 include: Personal.Male.Divorced.Seperated"
## [7] "Linear dependent variable(s) for subset 2 include: Personal.Female.NotSingle"
## [8] "Linear dependent variable(s) for subset 2 include: Personal.Male.Single"
## [1] "Linear dependent variable(s) for subset 3 include: Property.Unknown"
## [2] "Linear dependent variable(s) for subset 3 include: CheckingAccountStatus.lt.0"
## [3] "Linear dependent variable(s) for subset 3 include: CheckingAccountStatus.0.to.200"
## [4] "Linear dependent variable(s) for subset 3 include: CheckingAccountStatus.gt.200"
## [5] "Linear dependent variable(s) for subset 3 include: CheckingAccountStatus.none"
## [6] "Linear dependent variable(s) for subset 3 include: Property.RealEstate"
## [7] "Linear dependent variable(s) for subset 3 include: Property.Insurance"
## [8] "Linear dependent variable(s) for subset 3 include: Property.CarOther"
## [1] "Linear dependent variable(s) for subset 4 include: Housing.ForFree"
## [2] "Linear dependent variable(s) for subset 4 include: CheckingAccountStatus.lt.0"
## [3] "Linear dependent variable(s) for subset 4 include: CheckingAccountStatus.0.to.200"
## [4] "Linear dependent variable(s) for subset 4 include: CheckingAccountStatus.gt.200"
## [5] "Linear dependent variable(s) for subset 4 include: CheckingAccountStatus.none"
## [6] "Linear dependent variable(s) for subset 4 include: Housing.Rent"
## [7] "Linear dependent variable(s) for subset 4 include: Housing.Own"
##################################
# Identifying the linearly dependent variables for removal
##################################
# When linear dependencies exist, list the columns that findLinearCombos()
# suggests removing and build the numeric predictor set without them.
if (DPA_LinearlyDependentCount > 0) {
  DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)

  print(paste0("Linear dependency can be resolved by removing ",
               (DPA_LinearlyDependentForRemoval), " numeric variable(s)."))

  for (j in seq_len(DPA_LinearlyDependentForRemoval)) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }

  ##################################
  # Filtering out columns with linear dependency
  #################################
  # $remove holds positions within DPA.Predictors.Numeric, the same object that
  # is indexed here. drop = FALSE keeps a data frame if one column remains.
  DPA_ExcludedLinearlyDependent <- DPA.Predictors.Numeric[, -DPA_LinearlyDependent$remove, drop = FALSE]

  ##################################
  # Gathering descriptive statistics
  ##################################
  # Last expression of the branch; parenthesised so the summary is printed.
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
}
## [1] "Linear dependency can be resolved by removing 4 numeric variable(s)."
## [1] "Variable 1 for removal: EmploymentDuration.Unemployed"
## [1] "Variable 2 for removal: Personal.Male.Married.Widowed"
## [1] "Variable 3 for removal: Property.Unknown"
## [1] "Variable 4 for removal: Housing.ForFree"
Name | DPA_ExcludedLinearlyDepen… |
Number of rows | 1000 |
Number of columns | 44 |
_______________________ | |
Column type frequency: | |
numeric | 44 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.00 | 72 | ▇▇▂▁▁ |
Amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.25 | 18424 | ▇▂▁▁▁ |
InstallmentRatePercentage | 0 | 1 | 2.97 | 1.12 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▃▁▂▇ |
ResidenceDuration | 0 | 1 | 2.85 | 1.10 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▆▁▃▇ |
Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.00 | 75 | ▇▆▃▁▁ |
NumberExistingCredits | 0 | 1 | 1.41 | 0.58 | 1 | 1.0 | 1.0 | 2.00 | 4 | ▇▅▁▁▁ |
NumberPeopleMaintenance | 0 | 1 | 1.16 | 0.36 | 1 | 1.0 | 1.0 | 1.00 | 2 | ▇▁▁▁▂ |
Telephone | 0 | 1 | 0.60 | 0.49 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▆▁▁▁▇ |
CheckingAccountStatus.lt.0 | 0 | 1 | 0.27 | 0.45 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
CheckingAccountStatus.0.to.200 | 0 | 1 | 0.27 | 0.44 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
CheckingAccountStatus.gt.200 | 0 | 1 | 0.06 | 0.24 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
CheckingAccountStatus.none | 0 | 1 | 0.39 | 0.49 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▅ |
CreditHistory.PaidDuly | 0 | 1 | 0.53 | 0.50 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▇▁▁▁▇ |
CreditHistory.Delay | 0 | 1 | 0.09 | 0.28 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
CreditHistory.Critical | 0 | 1 | 0.29 | 0.46 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
Purpose.NewCar | 0 | 1 | 0.23 | 0.42 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
Purpose.UsedCar | 0 | 1 | 0.10 | 0.30 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
Purpose.Furniture.Equipment | 0 | 1 | 0.18 | 0.39 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
Purpose.Radio.Television | 0 | 1 | 0.28 | 0.45 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
Purpose.Education | 0 | 1 | 0.05 | 0.22 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
Purpose.Business | 0 | 1 | 0.10 | 0.30 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
SavingsAccountBonds.lt.100 | 0 | 1 | 0.60 | 0.49 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▅▁▁▁▇ |
SavingsAccountBonds.100.to.500 | 0 | 1 | 0.10 | 0.30 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
SavingsAccountBonds.500.to.1000 | 0 | 1 | 0.06 | 0.24 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
SavingsAccountBonds.Unknown | 0 | 1 | 0.18 | 0.39 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
EmploymentDuration.lt.1 | 0 | 1 | 0.17 | 0.38 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
EmploymentDuration.1.to.4 | 0 | 1 | 0.34 | 0.47 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▅ |
EmploymentDuration.4.to.7 | 0 | 1 | 0.17 | 0.38 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
EmploymentDuration.gt.7 | 0 | 1 | 0.25 | 0.43 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
Personal.Male.Divorced.Seperated | 0 | 1 | 0.05 | 0.22 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
Personal.Female.NotSingle | 0 | 1 | 0.31 | 0.46 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
Personal.Male.Single | 0 | 1 | 0.55 | 0.50 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▆▁▁▁▇ |
OtherDebtorsGuarantors.None | 0 | 1 | 0.91 | 0.29 | 0 | 1.0 | 1.0 | 1.00 | 1 | ▁▁▁▁▇ |
OtherDebtorsGuarantors.Guarantor | 0 | 1 | 0.05 | 0.22 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
Property.RealEstate | 0 | 1 | 0.28 | 0.45 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
Property.Insurance | 0 | 1 | 0.23 | 0.42 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
Property.CarOther | 0 | 1 | 0.33 | 0.47 | 0 | 0.0 | 0.0 | 1.00 | 1 | ▇▁▁▁▃ |
OtherInstallmentPlans.Bank | 0 | 1 | 0.14 | 0.35 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
OtherInstallmentPlans.None | 0 | 1 | 0.81 | 0.39 | 0 | 1.0 | 1.0 | 1.00 | 1 | ▂▁▁▁▇ |
Housing.Rent | 0 | 1 | 0.18 | 0.38 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
Housing.Own | 0 | 1 | 0.71 | 0.45 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▃▁▁▁▇ |
Job.UnskilledResident | 0 | 1 | 0.20 | 0.40 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
Job.SkilledEmployee | 0 | 1 | 0.63 | 0.48 | 0 | 0.0 | 1.0 | 1.00 | 1 | ▅▁▁▁▇ |
Job.Management.SelfEmp.HighlyQualified | 0 | 1 | 0.15 | 0.36 | 0 | 0.0 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
##################################
# Creating the pre-modelling dataset
##################################
# Start from the PMA data frame (defined earlier in the file).
PMA_ExcludedLowVariance <- PMA

##################################
# Filtering out columns with linear dependencies
# from the dataset with low-variance columns already filtered
# to create the pre-modelling dataset
##################################
# Assigning NULL through `$<-` removes a column and is a no-op when the
# column is not present.
PMA_ExcludedLowVariance$CheckingAccountStatus.lt.0 <- NULL
PMA_ExcludedLowVariance$EmploymentDuration.Unemployed <- NULL
PMA_ExcludedLowVariance$Personal.Male.Married.Widowed <- NULL
PMA_ExcludedLowVariance$Property.Unknown <- NULL
PMA_ExcludedLowVariance$Housing.ForFree <- NULL

# Keep the intermediate name so downstream code can refer to either object.
PMA_ExcludedLowVariance_ExcludedLinearlyDependent <- PMA_ExcludedLowVariance
PMA_PreModelling <- PMA_ExcludedLowVariance_ExcludedLinearlyDependent
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses make the assignment visible, so the skim summary
# is both stored and printed.
(PMA_PreModelling_Skimmed <- skim(PMA_PreModelling))
Name | PMA_PreModelling |
Number of rows | 1000 |
Number of columns | 44 |
_______________________ | |
Column type frequency: | |
factor | 37 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Telephone | 0 | 1 | FALSE | 2 | 1: 596, 0: 404 |
Class | 0 | 1 | FALSE | 2 | Goo: 700, Bad: 300 |
CheckingAccountStatus.0.to.200 | 0 | 1 | FALSE | 2 | 0: 731, 1: 269 |
CheckingAccountStatus.gt.200 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
CheckingAccountStatus.none | 0 | 1 | FALSE | 2 | 0: 606, 1: 394 |
CreditHistory.PaidDuly | 0 | 1 | FALSE | 2 | 1: 530, 0: 470 |
CreditHistory.Delay | 0 | 1 | FALSE | 2 | 0: 912, 1: 88 |
CreditHistory.Critical | 0 | 1 | FALSE | 2 | 0: 707, 1: 293 |
Purpose.NewCar | 0 | 1 | FALSE | 2 | 0: 766, 1: 234 |
Purpose.UsedCar | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
Purpose.Furniture.Equipment | 0 | 1 | FALSE | 2 | 0: 819, 1: 181 |
Purpose.Radio.Television | 0 | 1 | FALSE | 2 | 0: 720, 1: 280 |
Purpose.Education | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Purpose.Business | 0 | 1 | FALSE | 2 | 0: 903, 1: 97 |
SavingsAccountBonds.lt.100 | 0 | 1 | FALSE | 2 | 1: 603, 0: 397 |
SavingsAccountBonds.100.to.500 | 0 | 1 | FALSE | 2 | 0: 897, 1: 103 |
SavingsAccountBonds.500.to.1000 | 0 | 1 | FALSE | 2 | 0: 937, 1: 63 |
SavingsAccountBonds.Unknown | 0 | 1 | FALSE | 2 | 0: 817, 1: 183 |
EmploymentDuration.lt.1 | 0 | 1 | FALSE | 2 | 0: 828, 1: 172 |
EmploymentDuration.1.to.4 | 0 | 1 | FALSE | 2 | 0: 661, 1: 339 |
EmploymentDuration.4.to.7 | 0 | 1 | FALSE | 2 | 0: 826, 1: 174 |
EmploymentDuration.gt.7 | 0 | 1 | FALSE | 2 | 0: 747, 1: 253 |
Personal.Male.Divorced.Seperated | 0 | 1 | FALSE | 2 | 0: 950, 1: 50 |
Personal.Female.NotSingle | 0 | 1 | FALSE | 2 | 0: 690, 1: 310 |
Personal.Male.Single | 0 | 1 | FALSE | 2 | 1: 548, 0: 452 |
OtherDebtorsGuarantors.None | 0 | 1 | FALSE | 2 | 1: 907, 0: 93 |
OtherDebtorsGuarantors.Guarantor | 0 | 1 | FALSE | 2 | 0: 948, 1: 52 |
Property.RealEstate | 0 | 1 | FALSE | 2 | 0: 718, 1: 282 |
Property.Insurance | 0 | 1 | FALSE | 2 | 0: 768, 1: 232 |
Property.CarOther | 0 | 1 | FALSE | 2 | 0: 668, 1: 332 |
OtherInstallmentPlans.Bank | 0 | 1 | FALSE | 2 | 0: 861, 1: 139 |
OtherInstallmentPlans.None | 0 | 1 | FALSE | 2 | 1: 814, 0: 186 |
Housing.Rent | 0 | 1 | FALSE | 2 | 0: 821, 1: 179 |
Housing.Own | 0 | 1 | FALSE | 2 | 1: 713, 0: 287 |
Job.UnskilledResident | 0 | 1 | FALSE | 2 | 0: 800, 1: 200 |
Job.SkilledEmployee | 0 | 1 | FALSE | 2 | 1: 630, 0: 370 |
Job.Management.SelfEmp.HighlyQualified | 0 | 1 | FALSE | 2 | 0: 852, 1: 148 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Duration | 0 | 1 | 20.90 | 12.06 | 4 | 12.0 | 18.0 | 24.00 | 72 | ▇▇▂▁▁ |
Amount | 0 | 1 | 3271.26 | 2822.74 | 250 | 1365.5 | 2319.5 | 3972.25 | 18424 | ▇▂▁▁▁ |
InstallmentRatePercentage | 0 | 1 | 2.97 | 1.12 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▃▁▂▇ |
ResidenceDuration | 0 | 1 | 2.85 | 1.10 | 1 | 2.0 | 3.0 | 4.00 | 4 | ▂▆▁▃▇ |
Age | 0 | 1 | 35.55 | 11.38 | 19 | 27.0 | 33.0 | 42.00 | 75 | ▇▆▃▁▁ |
NumberExistingCredits | 0 | 1 | 1.41 | 0.58 | 1 | 1.0 | 1.0 | 2.00 | 4 | ▇▅▁▁▁ |
NumberPeopleMaintenance | 0 | 1 | 1.16 | 0.36 | 1 | 1.0 | 1.0 | 1.00 | 2 | ▇▁▁▁▂ |
##################################
# Loading dataset
##################################
# Work on a copy of the pre-modelling data for exploratory analysis.
EDA <- PMA_PreModelling

##################################
# Listing all predictors
##################################
# Every column except the response "Class"; drop = FALSE keeps the result
# a data frame even if only one predictor column remains.
EDA.Predictors <- EDA[, !names(EDA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# vapply() guarantees a logical mask (sapply() may change type on empty
# input); drop = FALSE keeps a data frame when a single column matches.
EDA.Predictors.Numeric <- EDA.Predictors[
  ,
  vapply(EDA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
ncol(EDA.Predictors.Numeric)
## [1] 7
names(EDA.Predictors.Numeric)
## [1] "Duration" "Amount"
## [3] "InstallmentRatePercentage" "ResidenceDuration"
## [5] "Age" "NumberExistingCredits"
## [7] "NumberPeopleMaintenance"
##################################
# Converting response variable data type to factor
##################################
EDA$Class <- as.factor(EDA$Class)
# Number of response classes.
length(levels(EDA$Class))
## [1] 2
##################################
# Formulating the box plots
##################################
# One panel per numeric predictor, split by the response class; both axes
# use free scales so each panel gets its own range.
featurePlot(
  x = EDA.Predictors.Numeric,
  y = EDA$Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(3, 3)
)
##################################
# Formulating the density plots
##################################
# One panel per numeric predictor with a density curve per response class;
# the legend is laid out with one column per class level.
featurePlot(
  x = EDA.Predictors.Numeric,
  y = EDA$Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(3, 3),
  auto.key = list(columns = nlevels(EDA$Class))
)
##################################
# Listing all factor predictors
##################################
# vapply() guarantees a logical mask (sapply() may change type on empty
# input); drop = FALSE keeps a data frame when a single column matches.
EDA.Predictors.Factor <- EDA.Predictors[
  ,
  vapply(EDA.Predictors, is.factor, logical(1)),
  drop = FALSE
]
ncol(EDA.Predictors.Factor)
## [1] 36
names(EDA.Predictors.Factor)
## [1] "Telephone"
## [2] "CheckingAccountStatus.0.to.200"
## [3] "CheckingAccountStatus.gt.200"
## [4] "CheckingAccountStatus.none"
## [5] "CreditHistory.PaidDuly"
## [6] "CreditHistory.Delay"
## [7] "CreditHistory.Critical"
## [8] "Purpose.NewCar"
## [9] "Purpose.UsedCar"
## [10] "Purpose.Furniture.Equipment"
## [11] "Purpose.Radio.Television"
## [12] "Purpose.Education"
## [13] "Purpose.Business"
## [14] "SavingsAccountBonds.lt.100"
## [15] "SavingsAccountBonds.100.to.500"
## [16] "SavingsAccountBonds.500.to.1000"
## [17] "SavingsAccountBonds.Unknown"
## [18] "EmploymentDuration.lt.1"
## [19] "EmploymentDuration.1.to.4"
## [20] "EmploymentDuration.4.to.7"
## [21] "EmploymentDuration.gt.7"
## [22] "Personal.Male.Divorced.Seperated"
## [23] "Personal.Female.NotSingle"
## [24] "Personal.Male.Single"
## [25] "OtherDebtorsGuarantors.None"
## [26] "OtherDebtorsGuarantors.Guarantor"
## [27] "Property.RealEstate"
## [28] "Property.Insurance"
## [29] "Property.CarOther"
## [30] "OtherInstallmentPlans.Bank"
## [31] "OtherInstallmentPlans.None"
## [32] "Housing.Rent"
## [33] "Housing.Own"
## [34] "Job.UnskilledResident"
## [35] "Job.SkilledEmployee"
## [36] "Job.Management.SelfEmp.HighlyQualified"
##################################
# Formulating the proportion tables
##################################
# For every factor predictor, tabulate the share of each response class
# within each predictor level and draw it as a stacked bar chart.
# A single loop replaces the former copy-pasted stanzas, which differed
# only in the predictor index.
PropTable_BarCharts <- vector("list", ncol(EDA.Predictors.Factor))

for (PropTable_Index in seq_along(PropTable_BarCharts)) {
  Prop_Label <- names(EDA.Predictors.Factor)[PropTable_Index]
  PropTable_Label <- paste0(Prop_Label, "_PropTable")
  PropTable_Data <- EDA[, c("Class", Prop_Label)]

  # Margin 2 normalises within each predictor level, so the Class
  # proportions of every level sum to 1.
  PropTable_Summary <- as.data.frame(prop.table(table(PropTable_Data), 2))

  # Keep the per-predictor "<predictor>_PropTable" object that the
  # original stanzas created, in case later code refers to it.
  assign(PropTable_Label, PropTable_Summary)

  PropTable_Summary$Variable <- rep(Prop_Label, nrow(PropTable_Summary))
  # Column 2 of the table data frame holds the predictor's levels.
  PropTable_Summary$Category <- PropTable_Summary[, 2]

  PropTable_BarCharts[[PropTable_Index]] <- barchart(
    Freq ~ Category | Variable,
    data = PropTable_Summary,
    groups = Class,
    stack = TRUE,
    ylab = "Proportion",
    auto.key = list(adj = 1, space = "top", columns = 2)
  )

  # Keep the individually numbered chart objects (PropTable_BarChart_1,
  # PropTable_BarChart_2, ...) for backward compatibility.
  assign(
    paste0("PropTable_BarChart_", PropTable_Index),
    PropTable_BarCharts[[PropTable_Index]]
  )
}

# Lay the charts out nine per page in a three-column grid, however many
# factor predictors there are.
PropTable_Pages <- split(
  seq_along(PropTable_BarCharts),
  ceiling(seq_along(PropTable_BarCharts) / 9)
)
for (PropTable_Page in PropTable_Pages) {
  grid.arrange(grobs = PropTable_BarCharts[PropTable_Page], ncol = 3)
}
##################################
# Splitting the pre-modelling dataset
# into the train and test sets
##################################
# Fixed seed so the partition is reproducible.
set.seed(12345678)
# createDataPartition() returns a list by default; [[1]] extracts the
# vector of training row indices (p = 0.8 of the rows, sampled on Class).
MA_Train_Index <- createDataPartition(PMA_PreModelling$Class, p = 0.8)[[1]]
MA_Train <- PMA_PreModelling[MA_Train_Index, ]
MA_Test <- PMA_PreModelling[-MA_Train_Index, ]
##################################
# Creating the modelling dataset
##################################
# Copies of the train and test sets under the names used for evaluation.
MA_Train.Evaluated <- MA_Train
MA_Test.Evaluated <- MA_Test
##################################
# Formulating the RPART model
# using a complexity parameter setting
# equal to 0.001
##################################
# Classification tree of Class on all other columns of the training set.
rpartFit.Apparent.CP.001 <- rpart(
  Class ~ .,
  data = MA_Train,
  control = rpart.control(cp = 0.001)
)
# Print the complexity-parameter table of the fitted tree.
printcp(rpartFit.Apparent.CP.001)
##
## Classification tree:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] Age Amount
## [3] CheckingAccountStatus.0.to.200 CheckingAccountStatus.gt.200
## [5] CheckingAccountStatus.none CreditHistory.Critical
## [7] Duration EmploymentDuration.4.to.7
## [9] EmploymentDuration.gt.7 InstallmentRatePercentage
## [11] OtherDebtorsGuarantors.Guarantor OtherInstallmentPlans.None
## [13] Property.CarOther Property.Insurance
## [15] Property.RealEstate Purpose.Business
## [17] Purpose.NewCar Purpose.UsedCar
## [19] SavingsAccountBonds.lt.100
##
## Root node error: 240/800 = 0.3
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.0291667 0 1.00000 1.00000 0.054006
## 2 0.0270833 5 0.84167 1.10417 0.055468
## 3 0.0208333 7 0.78750 1.06667 0.054975
## 4 0.0166667 8 0.76667 1.05417 0.054802
## 5 0.0141667 9 0.75000 1.03750 0.054566
## 6 0.0097222 15 0.65833 1.01250 0.054197
## 7 0.0083333 21 0.60000 1.00000 0.054006
## 8 0.0052083 24 0.57500 0.97917 0.053679
## 9 0.0041667 28 0.55417 0.97917 0.053679
## 10 0.0010000 33 0.53333 0.98750 0.053811
##################################
# Pruning the model
##################################
# NOTE(review): the pruning cp (0.001) equals the cp used to grow the
# tree, so this presumably leaves the tree unchanged - confirm intent.
rpartFit.Apparent.CP.001.Pruned <- prune(rpartFit.Apparent.CP.001, cp = 0.001)

# Print the detailed node-by-node summary and keep the returned object.
rpartFit.Apparent.CP.001.Pruned.Summary <- summary(rpartFit.Apparent.CP.001.Pruned)
## Call:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.001))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.029166667 0 1.0000000 1.0000000 0.05400617
## 2 0.027083333 5 0.8416667 1.1041667 0.05546814
## 3 0.020833333 7 0.7875000 1.0666667 0.05497474
## 4 0.016666667 8 0.7666667 1.0541667 0.05480216
## 5 0.014166667 9 0.7500000 1.0375000 0.05456564
## 6 0.009722222 15 0.6583333 1.0125000 0.05419691
## 7 0.008333333 21 0.6000000 1.0000000 0.05400617
## 8 0.005208333 24 0.5750000 0.9791667 0.05367869
## 9 0.004166667 28 0.5541667 0.9791667 0.05367869
## 10 0.001000000 33 0.5333333 0.9875000 0.05381113
##
## Variable importance
## CheckingAccountStatus.none Duration
## 17 12
## Amount SavingsAccountBonds.lt.100
## 11 6
## Age CheckingAccountStatus.0.to.200
## 6 5
## CreditHistory.Critical SavingsAccountBonds.Unknown
## 4 3
## SavingsAccountBonds.100.to.500 Property.RealEstate
## 3 3
## OtherDebtorsGuarantors.Guarantor InstallmentRatePercentage
## 2 2
## Purpose.NewCar OtherDebtorsGuarantors.None
## 2 2
## EmploymentDuration.4.to.7 Property.CarOther
## 2 2
## Property.Insurance OtherInstallmentPlans.None
## 2 2
## EmploymentDuration.1.to.4 Purpose.UsedCar
## 2 1
## Purpose.Business SavingsAccountBonds.500.to.1000
## 1 1
## EmploymentDuration.gt.7 CheckingAccountStatus.gt.200
## 1 1
## NumberExistingCredits OtherInstallmentPlans.Bank
## 1 1
## CreditHistory.PaidDuly Job.SkilledEmployee
## 1 1
## Purpose.Furniture.Equipment Purpose.Radio.Television
## 1 1
## Telephone Job.UnskilledResident
## 1 1
##
## Node number 1: 800 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.3 P(node) =1
## class counts: 240 560
## probabilities: 0.300 0.700
## left son=2 (484 obs) right son=3 (316 obs)
## Primary splits:
## CheckingAccountStatus.none splits as LR, improve=36.169470, (0 missing)
## CreditHistory.Critical splits as LR, improve=14.535700, (0 missing)
## Amount < 10918 to the right, improve=13.690600, (0 missing)
## Duration < 26.5 to the right, improve=10.957470, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve= 9.839849, (0 missing)
## Surrogate splits:
## CheckingAccountStatus.0.to.200 splits as RL, agree=0.660, adj=0.139, (0 split)
## CreditHistory.Critical splits as LR, agree=0.627, adj=0.057, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.626, adj=0.054, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.624, adj=0.047, (0 split)
## SavingsAccountBonds.lt.100 splits as RL, agree=0.623, adj=0.044, (0 split)
##
## Node number 2: 484 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4214876 P(node) =0.605
## class counts: 204 280
## probabilities: 0.421 0.579
## left son=4 (404 obs) right son=5 (80 obs)
## Primary splits:
## Duration < 11.5 to the right, improve=9.403355, (0 missing)
## Amount < 10841.5 to the right, improve=9.011413, (0 missing)
## CreditHistory.Critical splits as LR, improve=8.753453, (0 missing)
## Property.RealEstate splits as LR, improve=5.148565, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.821222, (0 missing)
## Surrogate splits:
## Age < 66.5 to the left, agree=0.845, adj=0.063, (0 split)
## Amount < 527.5 to the right, agree=0.841, adj=0.038, (0 split)
##
## Node number 3: 316 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.1139241 P(node) =0.395
## class counts: 36 280
## probabilities: 0.114 0.886
## left son=6 (49 obs) right son=7 (267 obs)
## Primary splits:
## OtherInstallmentPlans.None splits as LR, improve=3.422937, (0 missing)
## OtherInstallmentPlans.Bank splits as RL, improve=2.474148, (0 missing)
## Purpose.Business splits as RL, improve=2.463815, (0 missing)
## Amount < 3891 to the right, improve=2.187268, (0 missing)
## Age < 33.5 to the left, improve=2.068899, (0 missing)
## Surrogate splits:
## OtherInstallmentPlans.Bank splits as RL, agree=0.953, adj=0.694, (0 split)
## Duration < 45 to the right, agree=0.854, adj=0.061, (0 split)
##
## Node number 4: 404 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4653465 P(node) =0.505
## class counts: 188 216
## probabilities: 0.465 0.535
## left son=8 (20 obs) right son=9 (384 obs)
## Primary splits:
## Amount < 10841.5 to the right, improve=6.226578, (0 missing)
## Duration < 47.5 to the right, improve=5.986296, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.939718, (0 missing)
## CreditHistory.Critical splits as LR, improve=4.354026, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=4.296602, (0 missing)
##
## Node number 5: 80 observations, complexity param=0.004166667
## predicted class=Good expected loss=0.2 P(node) =0.1
## class counts: 16 64
## probabilities: 0.200 0.800
## left son=10 (42 obs) right son=11 (38 obs)
## Primary splits:
## Property.RealEstate splits as LR, improve=3.143860, (0 missing)
## Job.Management.SelfEmp.HighlyQualified splits as RL, improve=2.541176, (0 missing)
## NumberExistingCredits < 1.5 to the left, improve=2.325275, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.325275, (0 missing)
## Personal.Female.NotSingle splits as RL, improve=1.449057, (0 missing)
## Surrogate splits:
## Property.Insurance splits as RL, agree=0.725, adj=0.421, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.700, adj=0.368, (0 split)
## Job.UnskilledResident splits as LR, agree=0.688, adj=0.342, (0 split)
## Telephone splits as LR, agree=0.650, adj=0.263, (0 split)
## Property.CarOther splits as RL, agree=0.637, adj=0.237, (0 split)
##
## Node number 6: 49 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.2857143 P(node) =0.06125
## class counts: 14 35
## probabilities: 0.286 0.714
## left son=12 (34 obs) right son=13 (15 obs)
## Primary splits:
## EmploymentDuration.gt.7 splits as LR, improve=2.074510, (0 missing)
## Age < 43.5 to the left, improve=2.051282, (0 missing)
## Purpose.Business splits as RL, improve=1.913876, (0 missing)
## Purpose.Radio.Television splits as LR, improve=1.800000, (0 missing)
## EmploymentDuration.1.to.4 splits as RL, improve=1.779412, (0 missing)
## Surrogate splits:
## Age < 41.5 to the left, agree=0.837, adj=0.467, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.776, adj=0.267, (0 split)
## Property.Insurance splits as LR, agree=0.755, adj=0.200, (0 split)
## NumberExistingCredits < 2.5 to the left, agree=0.735, adj=0.133, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.735, adj=0.133, (0 split)
##
## Node number 7: 267 observations
## predicted class=Good expected loss=0.082397 P(node) =0.33375
## class counts: 22 245
## probabilities: 0.082 0.918
##
## Node number 8: 20 observations
## predicted class=Bad expected loss=0.15 P(node) =0.025
## class counts: 17 3
## probabilities: 0.850 0.150
##
## Node number 9: 384 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4453125 P(node) =0.48
## class counts: 171 213
## probabilities: 0.445 0.555
## left son=18 (88 obs) right son=19 (296 obs)
## Primary splits:
## Purpose.NewCar splits as RL, improve=4.114059, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.110862, (0 missing)
## Amount < 1381.5 to the left, improve=4.104405, (0 missing)
## Duration < 47.5 to the right, improve=4.095170, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.024666, (0 missing)
##
## Node number 10: 42 observations, complexity param=0.004166667
## predicted class=Good expected loss=0.3333333 P(node) =0.0525
## class counts: 14 28
## probabilities: 0.333 0.667
## left son=20 (30 obs) right son=21 (12 obs)
## Primary splits:
## CreditHistory.Critical splits as LR, improve=3.733333, (0 missing)
## NumberExistingCredits < 1.5 to the left, improve=2.880952, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.429167, (0 missing)
## EmploymentDuration.lt.1 splits as RL, improve=1.341153, (0 missing)
## Duration < 9.5 to the left, improve=1.131313, (0 missing)
## Surrogate splits:
## NumberExistingCredits < 1.5 to the left, agree=0.810, adj=0.333, (0 split)
## CreditHistory.PaidDuly splits as RL, agree=0.762, adj=0.167, (0 split)
## EmploymentDuration.4.to.7 splits as LR, agree=0.762, adj=0.167, (0 split)
## Amount < 1048.5 to the right, agree=0.738, adj=0.083, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.738, adj=0.083, (0 split)
##
## Node number 11: 38 observations
## predicted class=Good expected loss=0.05263158 P(node) =0.0475
## class counts: 2 36
## probabilities: 0.053 0.947
##
## Node number 12: 34 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.3823529 P(node) =0.0425
## class counts: 13 21
## probabilities: 0.382 0.618
## left son=24 (24 obs) right son=25 (10 obs)
## Primary splits:
## EmploymentDuration.4.to.7 splits as LR, improve=2.2588240, (0 missing)
## Purpose.Business splits as RL, improve=1.9788240, (0 missing)
## CreditHistory.Critical splits as RL, improve=1.9424210, (0 missing)
## Amount < 2190.5 to the right, improve=1.7254900, (0 missing)
## NumberExistingCredits < 1.5 to the right, improve=0.8366013, (0 missing)
## Surrogate splits:
## EmploymentDuration.1.to.4 splits as RL, agree=0.794, adj=0.3, (0 split)
## Amount < 8797.5 to the left, agree=0.735, adj=0.1, (0 split)
## Age < 24.5 to the right, agree=0.735, adj=0.1, (0 split)
## Property.CarOther splits as LR, agree=0.735, adj=0.1, (0 split)
##
## Node number 13: 15 observations
## predicted class=Good expected loss=0.06666667 P(node) =0.01875
## class counts: 1 14
## probabilities: 0.067 0.933
##
## Node number 18: 88 observations, complexity param=0.02708333
## predicted class=Bad expected loss=0.4204545 P(node) =0.11
## class counts: 51 37
## probabilities: 0.580 0.420
## left son=36 (33 obs) right son=37 (55 obs)
## Primary splits:
## Amount < 1392 to the left, improve=4.583333, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.207792, (0 missing)
## Age < 29.5 to the left, improve=2.506499, (0 missing)
## Duration < 22.5 to the right, improve=2.480381, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.376720, (0 missing)
## Surrogate splits:
## Duration < 13 to the left, agree=0.693, adj=0.182, (0 split)
## Age < 45.5 to the right, agree=0.670, adj=0.121, (0 split)
## Property.RealEstate splits as RL, agree=0.659, adj=0.091, (0 split)
## Job.UnskilledResident splits as RL, agree=0.636, adj=0.030, (0 split)
##
## Node number 19: 296 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4054054 P(node) =0.37
## class counts: 120 176
## probabilities: 0.405 0.595
## left son=38 (26 obs) right son=39 (270 obs)
## Primary splits:
## Duration < 46.5 to the right, improve=4.692446, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=3.771026, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.897053, (0 missing)
## Amount < 4038.5 to the right, improve=2.660474, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.495429, (0 missing)
##
## Node number 20: 30 observations, complexity param=0.004166667
## predicted class=Good expected loss=0.4666667 P(node) =0.0375
## class counts: 14 16
## probabilities: 0.467 0.533
## left son=40 (15 obs) right son=41 (15 obs)
## Primary splits:
## Property.Insurance splits as LR, improve=1.0666670, (0 missing)
## Housing.Own splits as LR, improve=1.0285710, (0 missing)
## Personal.Female.NotSingle splits as RL, improve=1.0147810, (0 missing)
## ResidenceDuration < 3.5 to the right, improve=0.5444444, (0 missing)
## CreditHistory.PaidDuly splits as LR, improve=0.5333333, (0 missing)
## Surrogate splits:
## Property.CarOther splits as RL, agree=0.767, adj=0.533, (0 split)
## Amount < 1398 to the right, agree=0.700, adj=0.400, (0 split)
## EmploymentDuration.gt.7 splits as RL, agree=0.700, adj=0.400, (0 split)
## Duration < 7.5 to the right, agree=0.667, adj=0.333, (0 split)
## Age < 23.5 to the right, agree=0.667, adj=0.333, (0 split)
##
## Node number 21: 12 observations
## predicted class=Good expected loss=0 P(node) =0.015
## class counts: 0 12
## probabilities: 0.000 1.000
##
## Node number 24: 24 observations, complexity param=0.005208333
## predicted class=Bad expected loss=0.5 P(node) =0.03
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=48 (7 obs) right son=49 (17 obs)
## Primary splits:
## Purpose.Business splits as RL, improve=2.521008, (0 missing)
## Duration < 16.5 to the right, improve=1.500000, (0 missing)
## Amount < 2190.5 to the right, improve=1.500000, (0 missing)
## Telephone splits as LR, improve=1.500000, (0 missing)
## Age < 35.5 to the right, improve=0.907563, (0 missing)
## Surrogate splits:
## Job.Management.SelfEmp.HighlyQualified splits as RL, agree=0.792, adj=0.286, (0 split)
## Duration < 42 to the right, agree=0.750, adj=0.143, (0 split)
## Age < 35.5 to the right, agree=0.750, adj=0.143, (0 split)
## SavingsAccountBonds.500.to.1000 splits as RL, agree=0.750, adj=0.143, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.750, adj=0.143, (0 split)
##
## Node number 25: 10 observations
## predicted class=Good expected loss=0.1 P(node) =0.0125
## class counts: 1 9
## probabilities: 0.100 0.900
##
## Node number 36: 33 observations
## predicted class=Bad expected loss=0.2121212 P(node) =0.04125
## class counts: 26 7
## probabilities: 0.788 0.212
##
## Node number 37: 55 observations, complexity param=0.02708333
## predicted class=Good expected loss=0.4545455 P(node) =0.06875
## class counts: 25 30
## probabilities: 0.455 0.545
## left son=74 (26 obs) right son=75 (29 obs)
## Primary splits:
## Duration < 22.5 to the right, improve=3.917290, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.563050, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.629870, (0 missing)
## Age < 29.5 to the left, improve=1.878706, (0 missing)
## Amount < 3904.5 to the right, improve=1.838554, (0 missing)
## Surrogate splits:
## Amount < 2674.5 to the right, agree=0.655, adj=0.269, (0 split)
## InstallmentRatePercentage < 3.5 to the right, agree=0.655, adj=0.269, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.618, adj=0.192, (0 split)
## EmploymentDuration.4.to.7 splits as RL, agree=0.618, adj=0.192, (0 split)
## Age < 27.5 to the left, agree=0.600, adj=0.154, (0 split)
##
## Node number 38: 26 observations, complexity param=0.01666667
## predicted class=Bad expected loss=0.3076923 P(node) =0.0325
## class counts: 18 8
## probabilities: 0.692 0.308
## left son=76 (18 obs) right son=77 (8 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=4.521368, (0 missing)
## Personal.Male.Single splits as LR, improve=2.188034, (0 missing)
## ResidenceDuration < 3.5 to the right, improve=2.155711, (0 missing)
## EmploymentDuration.gt.7 splits as RL, improve=1.813765, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=1.230769, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.Unknown splits as LR, agree=0.885, adj=0.625, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.808, adj=0.375, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.808, adj=0.375, (0 split)
## Job.SkilledEmployee splits as RL, agree=0.769, adj=0.250, (0 split)
## Amount < 4143.5 to the right, agree=0.731, adj=0.125, (0 split)
##
## Node number 39: 270 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.3777778 P(node) =0.3375
## class counts: 102 168
## probabilities: 0.378 0.622
## left son=78 (249 obs) right son=79 (21 obs)
## Primary splits:
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.964314, (0 missing)
## Purpose.UsedCar splits as LR, improve=3.008333, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.748558, (0 missing)
## OtherDebtorsGuarantors.None splits as RL, improve=2.377253, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.202868, (0 missing)
## Surrogate splits:
## OtherDebtorsGuarantors.None splits as RL, agree=0.963, adj=0.524, (0 split)
##
## Node number 40: 15 observations
## predicted class=Bad expected loss=0.4 P(node) =0.01875
## class counts: 9 6
## probabilities: 0.600 0.400
##
## Node number 41: 15 observations
## predicted class=Good expected loss=0.3333333 P(node) =0.01875
## class counts: 5 10
## probabilities: 0.333 0.667
##
## Node number 48: 7 observations
## predicted class=Bad expected loss=0.1428571 P(node) =0.00875
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 49: 17 observations
## predicted class=Good expected loss=0.3529412 P(node) =0.02125
## class counts: 6 11
## probabilities: 0.353 0.647
##
## Node number 74: 26 observations, complexity param=0.02083333
## predicted class=Bad expected loss=0.3461538 P(node) =0.0325
## class counts: 17 9
## probabilities: 0.654 0.346
## left son=148 (17 obs) right son=149 (9 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=5.128708, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.769231, (0 missing)
## Age < 29 to the left, improve=2.484382, (0 missing)
## Telephone splits as RL, improve=1.207139, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.207139, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.885, adj=0.667, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.769, adj=0.333, (0 split)
## Duration < 47.5 to the left, agree=0.731, adj=0.222, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.731, adj=0.222, (0 split)
## Amount < 5897 to the left, agree=0.692, adj=0.111, (0 split)
##
## Node number 75: 29 observations
## predicted class=Good expected loss=0.2758621 P(node) =0.03625
## class counts: 8 21
## probabilities: 0.276 0.724
##
## Node number 76: 18 observations
## predicted class=Bad expected loss=0.1111111 P(node) =0.0225
## class counts: 16 2
## probabilities: 0.889 0.111
##
## Node number 77: 8 observations
## predicted class=Good expected loss=0.25 P(node) =0.01
## class counts: 2 6
## probabilities: 0.250 0.750
##
## Node number 78: 249 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4056225 P(node) =0.31125
## class counts: 101 148
## probabilities: 0.406 0.594
## left son=156 (221 obs) right son=157 (28 obs)
## Primary splits:
## Purpose.UsedCar splits as LR, improve=3.252686, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.954286, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=2.289768, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.211870, (0 missing)
## Amount < 1367.5 to the left, improve=2.089978, (0 missing)
## Surrogate splits:
## Amount < 8877 to the left, agree=0.892, adj=0.036, (0 split)
##
## Node number 79: 21 observations
## predicted class=Good expected loss=0.04761905 P(node) =0.02625
## class counts: 1 20
## probabilities: 0.048 0.952
##
## Node number 148: 17 observations
## predicted class=Bad expected loss=0.1176471 P(node) =0.02125
## class counts: 15 2
## probabilities: 0.882 0.118
##
## Node number 149: 9 observations
## predicted class=Good expected loss=0.2222222 P(node) =0.01125
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 156: 221 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4343891 P(node) =0.27625
## class counts: 96 125
## probabilities: 0.434 0.566
## left son=312 (192 obs) right son=313 (29 obs)
## Primary splits:
## CheckingAccountStatus.gt.200 splits as LR, improve=2.487012, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.102047, (0 missing)
## Duration < 25.5 to the right, improve=1.712852, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.665913, (0 missing)
## Amount < 4038.5 to the right, improve=1.635174, (0 missing)
##
## Node number 157: 28 observations, complexity param=0.004166667
## predicted class=Good expected loss=0.1785714 P(node) =0.035
## class counts: 5 23
## probabilities: 0.179 0.821
## left son=314 (7 obs) right son=315 (21 obs)
## Primary splits:
## Amount < 7284 to the right, improve=2.8809520, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=1.3392860, (0 missing)
## Job.SkilledEmployee splits as RL, improve=0.8458647, (0 missing)
## Job.Management.SelfEmp.HighlyQualified splits as LR, improve=0.8458647, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=0.8091575, (0 missing)
##
## Node number 312: 192 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4635417 P(node) =0.24
## class counts: 89 103
## probabilities: 0.464 0.536
## left son=624 (104 obs) right son=625 (88 obs)
## Primary splits:
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.547276, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.533350, (0 missing)
## Duration < 28.5 to the right, improve=2.331440, (0 missing)
## Amount < 4038.5 to the right, improve=1.903117, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.898893, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.lt.100 splits as RL, agree=0.646, adj=0.227, (0 split)
## Age < 31.5 to the left, agree=0.625, adj=0.182, (0 split)
## Purpose.Business splits as LR, agree=0.609, adj=0.148, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.604, adj=0.136, (0 split)
## Purpose.Furniture.Equipment splits as RL, agree=0.589, adj=0.102, (0 split)
##
## Node number 313: 29 observations, complexity param=0.004166667
## predicted class=Good expected loss=0.2413793 P(node) =0.03625
## class counts: 7 22
## probabilities: 0.241 0.759
## left son=626 (7 obs) right son=627 (22 obs)
## Primary splits:
## Property.RealEstate splits as RL, improve=2.0103000, (0 missing)
## CreditHistory.PaidDuly splits as LR, improve=1.0762450, (0 missing)
## Amount < 2878 to the left, improve=1.0226500, (0 missing)
## Age < 39.5 to the left, improve=0.8025078, (0 missing)
## Duration < 22.5 to the left, improve=0.7254516, (0 missing)
## Surrogate splits:
## Age < 53 to the right, agree=0.793, adj=0.143, (0 split)
## CreditHistory.Delay splits as RL, agree=0.793, adj=0.143, (0 split)
## OtherDebtorsGuarantors.None splits as LR, agree=0.793, adj=0.143, (0 split)
##
## Node number 314: 7 observations
## predicted class=Bad expected loss=0.4285714 P(node) =0.00875
## class counts: 4 3
## probabilities: 0.571 0.429
##
## Node number 315: 21 observations
## predicted class=Good expected loss=0.04761905 P(node) =0.02625
## class counts: 1 20
## probabilities: 0.048 0.952
##
## Node number 624: 104 observations, complexity param=0.01416667
## predicted class=Bad expected loss=0.4615385 P(node) =0.13
## class counts: 56 48
## probabilities: 0.538 0.462
## left son=1248 (73 obs) right son=1249 (31 obs)
## Primary splits:
## Duration < 16.5 to the right, improve=2.978211, (0 missing)
## Amount < 4276.5 to the right, improve=1.692308, (0 missing)
## CreditHistory.Critical splits as LR, improve=1.398866, (0 missing)
## Age < 54 to the left, improve=1.258265, (0 missing)
## Purpose.Business splits as LR, improve=1.258265, (0 missing)
## Surrogate splits:
## Amount < 1119.5 to the right, agree=0.827, adj=0.419, (0 split)
##
## Node number 625: 88 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.375 P(node) =0.11
## class counts: 33 55
## probabilities: 0.375 0.625
## left son=1250 (9 obs) right son=1251 (79 obs)
## Primary splits:
## Age < 48.5 to the right, improve=3.252813, (0 missing)
## Job.Management.SelfEmp.HighlyQualified splits as RL, improve=2.475000, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.964286, (0 missing)
## Amount < 2168.5 to the left, improve=1.424663, (0 missing)
## Personal.Male.Single splits as LR, improve=1.281056, (0 missing)
##
## Node number 626: 7 observations
## predicted class=Bad expected loss=0.4285714 P(node) =0.00875
## class counts: 4 3
## probabilities: 0.571 0.429
##
## Node number 627: 22 observations
## predicted class=Good expected loss=0.1363636 P(node) =0.0275
## class counts: 3 19
## probabilities: 0.136 0.864
##
## Node number 1248: 73 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.3835616 P(node) =0.09125
## class counts: 45 28
## probabilities: 0.616 0.384
## left son=2496 (20 obs) right son=2497 (53 obs)
## Primary splits:
## Amount < 2178.5 to the left, improve=1.8563970, (0 missing)
## Duration < 31.5 to the right, improve=1.5753730, (0 missing)
## Age < 42.5 to the left, improve=1.1462310, (0 missing)
## Purpose.Business splits as LR, improve=1.0474710, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=0.9428169, (0 missing)
##
## Node number 1249: 31 observations, complexity param=0.008333333
## predicted class=Good expected loss=0.3548387 P(node) =0.03875
## class counts: 11 20
## probabilities: 0.355 0.645
## left son=2498 (22 obs) right son=2499 (9 obs)
## Primary splits:
## InstallmentRatePercentage < 3.5 to the right, improve=3.1935480, (0 missing)
## Purpose.Furniture.Equipment splits as LR, improve=2.8865310, (0 missing)
## Amount < 938.5 to the left, improve=1.7745010, (0 missing)
## Duration < 12.5 to the left, improve=1.1392010, (0 missing)
## Housing.Own splits as LR, improve=0.8251273, (0 missing)
## Surrogate splits:
## Amount < 1559 to the left, agree=0.774, adj=0.222, (0 split)
## Purpose.Furniture.Equipment splits as LR, agree=0.774, adj=0.222, (0 split)
## OtherDebtorsGuarantors.None splits as RL, agree=0.774, adj=0.222, (0 split)
## CreditHistory.Delay splits as LR, agree=0.742, adj=0.111, (0 split)
## Personal.Male.Divorced.Seperated splits as LR, agree=0.742, adj=0.111, (0 split)
##
## Node number 1250: 9 observations
## predicted class=Bad expected loss=0.2222222 P(node) =0.01125
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 1251: 79 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.3291139 P(node) =0.09875
## class counts: 26 53
## probabilities: 0.329 0.671
## left son=2502 (44 obs) right son=2503 (35 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=2.095167, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.937309, (0 missing)
## Personal.Male.Single splits as LR, improve=1.774107, (0 missing)
## Amount < 2168.5 to the left, improve=1.531237, (0 missing)
## EmploymentDuration.lt.1 splits as RL, improve=1.045725, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.734, adj=0.400, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.709, adj=0.343, (0 split)
## CreditHistory.PaidDuly splits as LR, agree=0.646, adj=0.200, (0 split)
## Housing.Rent splits as LR, agree=0.620, adj=0.143, (0 split)
## Age < 27.5 to the right, agree=0.608, adj=0.114, (0 split)
##
## Node number 2496: 20 observations
## predicted class=Bad expected loss=0.2 P(node) =0.025
## class counts: 16 4
## probabilities: 0.800 0.200
##
## Node number 2497: 53 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.4528302 P(node) =0.06625
## class counts: 29 24
## probabilities: 0.547 0.453
## left son=4994 (13 obs) right son=4995 (40 obs)
## Primary splits:
## Duration < 31.5 to the right, improve=1.6987660, (0 missing)
## Amount < 2452 to the right, improve=1.1026600, (0 missing)
## Age < 22.5 to the right, improve=1.1026600, (0 missing)
## Property.CarOther splits as LR, improve=0.9784367, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=0.9177457, (0 missing)
## Surrogate splits:
## CreditHistory.Critical splits as RL, agree=0.830, adj=0.308, (0 split)
## OtherDebtorsGuarantors.None splits as LR, agree=0.774, adj=0.077, (0 split)
##
## Node number 2498: 22 observations, complexity param=0.008333333
## predicted class=Bad expected loss=0.5 P(node) =0.0275
## class counts: 11 11
## probabilities: 0.500 0.500
## left son=4996 (10 obs) right son=4997 (12 obs)
## Primary splits:
## Age < 27.5 to the right, improve=1.4666670, (0 missing)
## Personal.Female.NotSingle splits as LR, improve=1.4666670, (0 missing)
## Amount < 800.5 to the left, improve=0.3928571, (0 missing)
## Housing.Own splits as LR, improve=0.3666667, (0 missing)
## Job.UnskilledResident splits as LR, improve=0.1047619, (0 missing)
## Surrogate splits:
## Telephone splits as LR, agree=0.727, adj=0.4, (0 split)
## CreditHistory.PaidDuly splits as LR, agree=0.727, adj=0.4, (0 split)
## Personal.Female.NotSingle splits as LR, agree=0.727, adj=0.4, (0 split)
## Personal.Male.Single splits as RL, agree=0.727, adj=0.4, (0 split)
## NumberExistingCredits < 1.5 to the right, agree=0.682, adj=0.3, (0 split)
##
## Node number 2499: 9 observations
## predicted class=Good expected loss=0 P(node) =0.01125
## class counts: 0 9
## probabilities: 0.000 1.000
##
## Node number 2502: 44 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.4318182 P(node) =0.055
## class counts: 19 25
## probabilities: 0.432 0.568
## left son=5004 (29 obs) right son=5005 (15 obs)
## Primary splits:
## CreditHistory.Critical splits as LR, improve=2.446082, (0 missing)
## NumberExistingCredits < 1.5 to the left, improve=2.139929, (0 missing)
## Amount < 2168.5 to the left, improve=1.958430, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.355615, (0 missing)
## Age < 29.5 to the right, improve=1.241484, (0 missing)
## Surrogate splits:
## CreditHistory.PaidDuly splits as RL, agree=0.727, adj=0.200, (0 split)
## Amount < 3408.5 to the left, agree=0.682, adj=0.067, (0 split)
## NumberExistingCredits < 1.5 to the left, agree=0.682, adj=0.067, (0 split)
## Property.Insurance splits as LR, agree=0.682, adj=0.067, (0 split)
##
## Node number 2503: 35 observations
## predicted class=Good expected loss=0.2 P(node) =0.04375
## class counts: 7 28
## probabilities: 0.200 0.800
##
## Node number 4994: 13 observations
## predicted class=Bad expected loss=0.2307692 P(node) =0.01625
## class counts: 10 3
## probabilities: 0.769 0.231
##
## Node number 4995: 40 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.475 P(node) =0.05
## class counts: 19 21
## probabilities: 0.475 0.525
## left son=9990 (25 obs) right son=9991 (15 obs)
## Primary splits:
## Property.CarOther splits as LR, improve=2.083333, (0 missing)
## Age < 30.5 to the right, improve=1.763333, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=1.543407, (0 missing)
## Amount < 3575.5 to the left, improve=1.408333, (0 missing)
## Property.Insurance splits as RL, improve=1.213736, (0 missing)
## Surrogate splits:
## Property.Insurance splits as RL, agree=0.725, adj=0.267, (0 split)
## Amount < 3415 to the left, agree=0.675, adj=0.133, (0 split)
## Age < 21.5 to the right, agree=0.675, adj=0.133, (0 split)
## NumberExistingCredits < 2.5 to the left, agree=0.675, adj=0.133, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.675, adj=0.133, (0 split)
##
## Node number 4996: 10 observations
## predicted class=Bad expected loss=0.3 P(node) =0.0125
## class counts: 7 3
## probabilities: 0.700 0.300
##
## Node number 4997: 12 observations
## predicted class=Good expected loss=0.3333333 P(node) =0.015
## class counts: 4 8
## probabilities: 0.333 0.667
##
## Node number 5004: 29 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.4482759 P(node) =0.03625
## class counts: 16 13
## probabilities: 0.552 0.448
## left son=10008 (13 obs) right son=10009 (16 obs)
## Primary splits:
## Age < 33.5 to the right, improve=2.2294430, (0 missing)
## ResidenceDuration < 3.5 to the left, improve=1.2539180, (0 missing)
## Job.SkilledEmployee splits as LR, improve=1.0923020, (0 missing)
## Amount < 2168.5 to the left, improve=0.9313660, (0 missing)
## Duration < 20.5 to the right, improve=0.8210181, (0 missing)
## Surrogate splits:
## ResidenceDuration < 2.5 to the right, agree=0.724, adj=0.385, (0 split)
## Job.SkilledEmployee splits as LR, agree=0.724, adj=0.385, (0 split)
## Amount < 2965.5 to the right, agree=0.690, adj=0.308, (0 split)
## Purpose.Furniture.Equipment splits as RL, agree=0.690, adj=0.308, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.690, adj=0.308, (0 split)
##
## Node number 5005: 15 observations
## predicted class=Good expected loss=0.2 P(node) =0.01875
## class counts: 3 12
## probabilities: 0.200 0.800
##
## Node number 9990: 25 observations, complexity param=0.008333333
## predicted class=Bad expected loss=0.4 P(node) =0.03125
## class counts: 15 10
## probabilities: 0.600 0.400
## left son=19980 (17 obs) right son=19981 (8 obs)
## Primary splits:
## Amount < 3106 to the right, improve=1.1911760, (0 missing)
## Age < 25 to the right, improve=1.1911760, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=1.1911760, (0 missing)
## EmploymentDuration.lt.1 splits as RL, improve=0.8888889, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=0.5714286, (0 missing)
## Surrogate splits:
## Property.RealEstate splits as LR, agree=0.80, adj=0.375, (0 split)
## Age < 60.5 to the left, agree=0.76, adj=0.250, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.76, adj=0.250, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.72, adj=0.125, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.72, adj=0.125, (0 split)
##
## Node number 9991: 15 observations
## predicted class=Good expected loss=0.2666667 P(node) =0.01875
## class counts: 4 11
## probabilities: 0.267 0.733
##
## Node number 10008: 13 observations
## predicted class=Bad expected loss=0.2307692 P(node) =0.01625
## class counts: 10 3
## probabilities: 0.769 0.231
##
## Node number 10009: 16 observations
## predicted class=Good expected loss=0.375 P(node) =0.02
## class counts: 6 10
## probabilities: 0.375 0.625
##
## Node number 19980: 17 observations
## predicted class=Bad expected loss=0.2941176 P(node) =0.02125
## class counts: 12 5
## probabilities: 0.706 0.294
##
## Node number 19981: 8 observations
## predicted class=Good expected loss=0.375 P(node) =0.01
## class counts: 3 5
## probabilities: 0.375 0.625
##################################
# Identifying the most predictive variables
# (importance scores stored in the model summary)
##################################
rpartFit.Apparent.CP.001.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 25.5883818
## Amount SavingsAccountBonds.lt.100
## 23.4460098 13.9266136
## Age CheckingAccountStatus.0.to.200
## 12.1698806 10.9886136
## CreditHistory.Critical SavingsAccountBonds.Unknown
## 8.7623987 6.6785937
## SavingsAccountBonds.100.to.500 Property.RealEstate
## 6.4489710 6.0175175
## OtherDebtorsGuarantors.Guarantor InstallmentRatePercentage
## 4.9643144 4.2482033
## Purpose.NewCar OtherDebtorsGuarantors.None
## 4.1140587 3.7278926
## EmploymentDuration.4.to.7 Property.CarOther
## 3.6343707 3.6227029
## Property.Insurance OtherInstallmentPlans.None
## 3.5239267 3.4229365
## EmploymentDuration.1.to.4 Purpose.UsedCar
## 3.2604912 3.2526862
## Purpose.Business SavingsAccountBonds.500.to.1000
## 2.8973105 2.7314685
## EmploymentDuration.gt.7 CheckingAccountStatus.gt.200
## 2.5011765 2.4870121
## NumberExistingCredits OtherInstallmentPlans.Bank
## 2.4018956 2.3750988
## CreditHistory.PaidDuly Job.SkilledEmployee
## 2.1171386 1.9878199
## Purpose.Furniture.Equipment Purpose.Radio.Television
## 1.6561767 1.5369792
## Telephone Job.UnskilledResident
## 1.4139982 1.2144198
## ResidenceDuration Job.Management.SelfEmp.HighlyQualified
## 0.8574781 0.7202881
## CreditHistory.Delay Personal.Female.NotSingle
## 0.6420244 0.5866667
## Personal.Male.Single Personal.Male.Divorced.Seperated
## 0.5866667 0.3548387
## Housing.Rent
## 0.2993096
##################################
# Plotting the RPART model structure
# (tree diagram drawn with caption set to NULL)
##################################
fancyRpartPlot(
  rpartFit.Apparent.CP.001.Pruned,
  caption = NULL
)
##################################
# Evaluating the RPART model
# on the train set
##################################
# NOTE(review): MA_Train.Evaluated is assumed to be created earlier in the
# file; confirm before running this section on its own.
# Predicted class label for every observation in MA_Train.
MA_Train.Evaluated$PredClass.CP.001 <- predict(
  rpartFit.Apparent.CP.001.Pruned,
  newdata = MA_Train,
  type = "class"
)
# 1 when the predicted class equals the observed Class, 0 otherwise.
MA_Train.Evaluated$PredCorrect.CP.001 <- ifelse(
  MA_Train.Evaluated$Class == MA_Train.Evaluated$PredClass.CP.001, 1, 0
)
##################################
# Computing for the
# apparent model performance
# (proportion of correctly classified train rows)
##################################
# The outer parentheses make R print the value as it is assigned
(Train.CP.001 <- mean(MA_Train.Evaluated$PredCorrect.CP.001))
## [1] 0.84
##################################
# Evaluating the RPART model
# on the test set
##################################
# Predicted class labels from the CP.001 pruned tree for each row of MA_Test
MA_Test.Evaluated$PredClass.CP.001 <- predict(rpartFit.Apparent.CP.001.Pruned,
                                              newdata = MA_Test,
                                              type = "class")
# 1 where the predicted class equals the observed Class, 0 otherwise
MA_Test.Evaluated$PredCorrect.CP.001 <- ifelse(
  MA_Test.Evaluated$Class == MA_Test.Evaluated$PredClass.CP.001, 1, 0
)
##################################
# Computing for the
# external validation model performance
# (proportion of correctly classified test rows)
##################################
# The outer parentheses make R print the value as it is assigned
(Test.CP.001 <- mean(MA_Test.Evaluated$PredCorrect.CP.001))
## [1] 0.73
##################################
# Formulating the RPART model
# using a complexity parameter setting
# equal to 0.005
##################################
# Class is modelled on every other column of MA_Train (formula Class ~ .)
rpartFit.Apparent.CP.005 <- rpart(Class ~ .,
                                  data = MA_Train,
                                  control = rpart.control(cp = 0.005))
# Display the complexity-parameter table of the fitted tree
printcp(rpartFit.Apparent.CP.005)
##
## Classification tree:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.005))
##
## Variables actually used in tree construction:
## [1] Age Amount
## [3] CheckingAccountStatus.0.to.200 CheckingAccountStatus.gt.200
## [5] CheckingAccountStatus.none CreditHistory.Critical
## [7] Duration EmploymentDuration.4.to.7
## [9] EmploymentDuration.gt.7 InstallmentRatePercentage
## [11] OtherDebtorsGuarantors.Guarantor OtherInstallmentPlans.None
## [13] Property.CarOther Purpose.Business
## [15] Purpose.NewCar Purpose.UsedCar
## [17] SavingsAccountBonds.lt.100
##
## Root node error: 240/800 = 0.3
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.0291667 0 1.00000 1.00000 0.054006
## 2 0.0270833 5 0.84167 1.05833 0.054860
## 3 0.0208333 7 0.78750 1.04167 0.054625
## 4 0.0166667 8 0.76667 1.00000 0.054006
## 5 0.0141667 9 0.75000 1.02083 0.054322
## 6 0.0097222 15 0.65833 1.03750 0.054566
## 7 0.0083333 21 0.60000 0.98750 0.053811
## 8 0.0052083 24 0.57500 0.99167 0.053877
## 9 0.0050000 28 0.55417 1.00417 0.054070
##################################
# Pruning the model
##################################
rpartFit.Apparent.CP.005.Pruned <- prune(rpartFit.Apparent.CP.005, cp = 0.005)
# summary() prints the node-by-node report; the returned object is kept so
# that its variable.importance component can be inspected afterwards
rpartFit.Apparent.CP.005.Pruned.Summary <- summary(rpartFit.Apparent.CP.005.Pruned)
## Call:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.005))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.029166667 0 1.0000000 1.0000000 0.05400617
## 2 0.027083333 5 0.8416667 1.0583333 0.05486014
## 3 0.020833333 7 0.7875000 1.0416667 0.05462546
## 4 0.016666667 8 0.7666667 1.0000000 0.05400617
## 5 0.014166667 9 0.7500000 1.0208333 0.05432169
## 6 0.009722222 15 0.6583333 1.0375000 0.05456564
## 7 0.008333333 21 0.6000000 0.9875000 0.05381113
## 8 0.005208333 24 0.5750000 0.9916667 0.05387663
## 9 0.005000000 28 0.5541667 1.0041667 0.05407023
##
## Variable importance
## CheckingAccountStatus.none Duration
## 19 13
## Amount SavingsAccountBonds.lt.100
## 10 7
## Age CheckingAccountStatus.0.to.200
## 6 6
## SavingsAccountBonds.Unknown SavingsAccountBonds.100.to.500
## 3 3
## CreditHistory.Critical OtherDebtorsGuarantors.Guarantor
## 3 3
## InstallmentRatePercentage Purpose.NewCar
## 2 2
## OtherDebtorsGuarantors.None OtherInstallmentPlans.None
## 2 2
## Purpose.UsedCar EmploymentDuration.4.to.7
## 2 2
## Purpose.Business SavingsAccountBonds.500.to.1000
## 1 1
## CheckingAccountStatus.gt.200 OtherInstallmentPlans.Bank
## 1 1
## Property.CarOther EmploymentDuration.gt.7
## 1 1
## Job.SkilledEmployee EmploymentDuration.1.to.4
## 1 1
## Purpose.Furniture.Equipment Purpose.Radio.Television
## 1 1
## CreditHistory.PaidDuly NumberExistingCredits
## 1 1
## Property.Insurance
## 1
##
## Node number 1: 800 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.3 P(node) =1
## class counts: 240 560
## probabilities: 0.300 0.700
## left son=2 (484 obs) right son=3 (316 obs)
## Primary splits:
## CheckingAccountStatus.none splits as LR, improve=36.169470, (0 missing)
## CreditHistory.Critical splits as LR, improve=14.535700, (0 missing)
## Amount < 10918 to the right, improve=13.690600, (0 missing)
## Duration < 26.5 to the right, improve=10.957470, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve= 9.839849, (0 missing)
## Surrogate splits:
## CheckingAccountStatus.0.to.200 splits as RL, agree=0.660, adj=0.139, (0 split)
## CreditHistory.Critical splits as LR, agree=0.627, adj=0.057, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.626, adj=0.054, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.624, adj=0.047, (0 split)
## SavingsAccountBonds.lt.100 splits as RL, agree=0.623, adj=0.044, (0 split)
##
## Node number 2: 484 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4214876 P(node) =0.605
## class counts: 204 280
## probabilities: 0.421 0.579
## left son=4 (404 obs) right son=5 (80 obs)
## Primary splits:
## Duration < 11.5 to the right, improve=9.403355, (0 missing)
## Amount < 10841.5 to the right, improve=9.011413, (0 missing)
## CreditHistory.Critical splits as LR, improve=8.753453, (0 missing)
## Property.RealEstate splits as LR, improve=5.148565, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.821222, (0 missing)
## Surrogate splits:
## Age < 66.5 to the left, agree=0.845, adj=0.063, (0 split)
## Amount < 527.5 to the right, agree=0.841, adj=0.038, (0 split)
##
## Node number 3: 316 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.1139241 P(node) =0.395
## class counts: 36 280
## probabilities: 0.114 0.886
## left son=6 (49 obs) right son=7 (267 obs)
## Primary splits:
## OtherInstallmentPlans.None splits as LR, improve=3.422937, (0 missing)
## OtherInstallmentPlans.Bank splits as RL, improve=2.474148, (0 missing)
## Purpose.Business splits as RL, improve=2.463815, (0 missing)
## Amount < 3891 to the right, improve=2.187268, (0 missing)
## Age < 33.5 to the left, improve=2.068899, (0 missing)
## Surrogate splits:
## OtherInstallmentPlans.Bank splits as RL, agree=0.953, adj=0.694, (0 split)
## Duration < 45 to the right, agree=0.854, adj=0.061, (0 split)
##
## Node number 4: 404 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4653465 P(node) =0.505
## class counts: 188 216
## probabilities: 0.465 0.535
## left son=8 (20 obs) right son=9 (384 obs)
## Primary splits:
## Amount < 10841.5 to the right, improve=6.226578, (0 missing)
## Duration < 47.5 to the right, improve=5.986296, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.939718, (0 missing)
## CreditHistory.Critical splits as LR, improve=4.354026, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=4.296602, (0 missing)
##
## Node number 5: 80 observations
## predicted class=Good expected loss=0.2 P(node) =0.1
## class counts: 16 64
## probabilities: 0.200 0.800
##
## Node number 6: 49 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.2857143 P(node) =0.06125
## class counts: 14 35
## probabilities: 0.286 0.714
## left son=12 (34 obs) right son=13 (15 obs)
## Primary splits:
## EmploymentDuration.gt.7 splits as LR, improve=2.074510, (0 missing)
## Age < 43.5 to the left, improve=2.051282, (0 missing)
## Purpose.Business splits as RL, improve=1.913876, (0 missing)
## Purpose.Radio.Television splits as LR, improve=1.800000, (0 missing)
## EmploymentDuration.1.to.4 splits as RL, improve=1.779412, (0 missing)
## Surrogate splits:
## Age < 41.5 to the left, agree=0.837, adj=0.467, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.776, adj=0.267, (0 split)
## Property.Insurance splits as LR, agree=0.755, adj=0.200, (0 split)
## NumberExistingCredits < 2.5 to the left, agree=0.735, adj=0.133, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.735, adj=0.133, (0 split)
##
## Node number 7: 267 observations
## predicted class=Good expected loss=0.082397 P(node) =0.33375
## class counts: 22 245
## probabilities: 0.082 0.918
##
## Node number 8: 20 observations
## predicted class=Bad expected loss=0.15 P(node) =0.025
## class counts: 17 3
## probabilities: 0.850 0.150
##
## Node number 9: 384 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4453125 P(node) =0.48
## class counts: 171 213
## probabilities: 0.445 0.555
## left son=18 (88 obs) right son=19 (296 obs)
## Primary splits:
## Purpose.NewCar splits as RL, improve=4.114059, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.110862, (0 missing)
## Amount < 1381.5 to the left, improve=4.104405, (0 missing)
## Duration < 47.5 to the right, improve=4.095170, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.024666, (0 missing)
##
## Node number 12: 34 observations, complexity param=0.005208333
## predicted class=Good expected loss=0.3823529 P(node) =0.0425
## class counts: 13 21
## probabilities: 0.382 0.618
## left son=24 (24 obs) right son=25 (10 obs)
## Primary splits:
## EmploymentDuration.4.to.7 splits as LR, improve=2.2588240, (0 missing)
## Purpose.Business splits as RL, improve=1.9788240, (0 missing)
## CreditHistory.Critical splits as RL, improve=1.9424210, (0 missing)
## Amount < 2190.5 to the right, improve=1.7254900, (0 missing)
## NumberExistingCredits < 1.5 to the right, improve=0.8366013, (0 missing)
## Surrogate splits:
## EmploymentDuration.1.to.4 splits as RL, agree=0.794, adj=0.3, (0 split)
## Amount < 8797.5 to the left, agree=0.735, adj=0.1, (0 split)
## Age < 24.5 to the right, agree=0.735, adj=0.1, (0 split)
## Property.CarOther splits as LR, agree=0.735, adj=0.1, (0 split)
##
## Node number 13: 15 observations
## predicted class=Good expected loss=0.06666667 P(node) =0.01875
## class counts: 1 14
## probabilities: 0.067 0.933
##
## Node number 18: 88 observations, complexity param=0.02708333
## predicted class=Bad expected loss=0.4204545 P(node) =0.11
## class counts: 51 37
## probabilities: 0.580 0.420
## left son=36 (33 obs) right son=37 (55 obs)
## Primary splits:
## Amount < 1392 to the left, improve=4.583333, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.207792, (0 missing)
## Age < 29.5 to the left, improve=2.506499, (0 missing)
## Duration < 22.5 to the right, improve=2.480381, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.376720, (0 missing)
## Surrogate splits:
## Duration < 13 to the left, agree=0.693, adj=0.182, (0 split)
## Age < 45.5 to the right, agree=0.670, adj=0.121, (0 split)
## Property.RealEstate splits as RL, agree=0.659, adj=0.091, (0 split)
## Job.UnskilledResident splits as RL, agree=0.636, adj=0.030, (0 split)
##
## Node number 19: 296 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4054054 P(node) =0.37
## class counts: 120 176
## probabilities: 0.405 0.595
## left son=38 (26 obs) right son=39 (270 obs)
## Primary splits:
## Duration < 46.5 to the right, improve=4.692446, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=3.771026, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.897053, (0 missing)
## Amount < 4038.5 to the right, improve=2.660474, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.495429, (0 missing)
##
## Node number 24: 24 observations, complexity param=0.005208333
## predicted class=Bad expected loss=0.5 P(node) =0.03
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=48 (7 obs) right son=49 (17 obs)
## Primary splits:
## Purpose.Business splits as RL, improve=2.521008, (0 missing)
## Duration < 16.5 to the right, improve=1.500000, (0 missing)
## Amount < 2190.5 to the right, improve=1.500000, (0 missing)
## Telephone splits as LR, improve=1.500000, (0 missing)
## Age < 35.5 to the right, improve=0.907563, (0 missing)
## Surrogate splits:
## Job.Management.SelfEmp.HighlyQualified splits as RL, agree=0.792, adj=0.286, (0 split)
## Duration < 42 to the right, agree=0.750, adj=0.143, (0 split)
## Age < 35.5 to the right, agree=0.750, adj=0.143, (0 split)
## SavingsAccountBonds.500.to.1000 splits as RL, agree=0.750, adj=0.143, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.750, adj=0.143, (0 split)
##
## Node number 25: 10 observations
## predicted class=Good expected loss=0.1 P(node) =0.0125
## class counts: 1 9
## probabilities: 0.100 0.900
##
## Node number 36: 33 observations
## predicted class=Bad expected loss=0.2121212 P(node) =0.04125
## class counts: 26 7
## probabilities: 0.788 0.212
##
## Node number 37: 55 observations, complexity param=0.02708333
## predicted class=Good expected loss=0.4545455 P(node) =0.06875
## class counts: 25 30
## probabilities: 0.455 0.545
## left son=74 (26 obs) right son=75 (29 obs)
## Primary splits:
## Duration < 22.5 to the right, improve=3.917290, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.563050, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.629870, (0 missing)
## Age < 29.5 to the left, improve=1.878706, (0 missing)
## Amount < 3904.5 to the right, improve=1.838554, (0 missing)
## Surrogate splits:
## Amount < 2674.5 to the right, agree=0.655, adj=0.269, (0 split)
## InstallmentRatePercentage < 3.5 to the right, agree=0.655, adj=0.269, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.618, adj=0.192, (0 split)
## EmploymentDuration.4.to.7 splits as RL, agree=0.618, adj=0.192, (0 split)
## Age < 27.5 to the left, agree=0.600, adj=0.154, (0 split)
##
## Node number 38: 26 observations, complexity param=0.01666667
## predicted class=Bad expected loss=0.3076923 P(node) =0.0325
## class counts: 18 8
## probabilities: 0.692 0.308
## left son=76 (18 obs) right son=77 (8 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=4.521368, (0 missing)
## Personal.Male.Single splits as LR, improve=2.188034, (0 missing)
## ResidenceDuration < 3.5 to the right, improve=2.155711, (0 missing)
## EmploymentDuration.gt.7 splits as RL, improve=1.813765, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=1.230769, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.Unknown splits as LR, agree=0.885, adj=0.625, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.808, adj=0.375, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.808, adj=0.375, (0 split)
## Job.SkilledEmployee splits as RL, agree=0.769, adj=0.250, (0 split)
## Amount < 4143.5 to the right, agree=0.731, adj=0.125, (0 split)
##
## Node number 39: 270 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.3777778 P(node) =0.3375
## class counts: 102 168
## probabilities: 0.378 0.622
## left son=78 (249 obs) right son=79 (21 obs)
## Primary splits:
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.964314, (0 missing)
## Purpose.UsedCar splits as LR, improve=3.008333, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.748558, (0 missing)
## OtherDebtorsGuarantors.None splits as RL, improve=2.377253, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.202868, (0 missing)
## Surrogate splits:
## OtherDebtorsGuarantors.None splits as RL, agree=0.963, adj=0.524, (0 split)
##
## Node number 48: 7 observations
## predicted class=Bad expected loss=0.1428571 P(node) =0.00875
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 49: 17 observations
## predicted class=Good expected loss=0.3529412 P(node) =0.02125
## class counts: 6 11
## probabilities: 0.353 0.647
##
## Node number 74: 26 observations, complexity param=0.02083333
## predicted class=Bad expected loss=0.3461538 P(node) =0.0325
## class counts: 17 9
## probabilities: 0.654 0.346
## left son=148 (17 obs) right son=149 (9 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=5.128708, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.769231, (0 missing)
## Age < 29 to the left, improve=2.484382, (0 missing)
## Telephone splits as RL, improve=1.207139, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.207139, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.885, adj=0.667, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.769, adj=0.333, (0 split)
## Duration < 47.5 to the left, agree=0.731, adj=0.222, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.731, adj=0.222, (0 split)
## Amount < 5897 to the left, agree=0.692, adj=0.111, (0 split)
##
## Node number 75: 29 observations
## predicted class=Good expected loss=0.2758621 P(node) =0.03625
## class counts: 8 21
## probabilities: 0.276 0.724
##
## Node number 76: 18 observations
## predicted class=Bad expected loss=0.1111111 P(node) =0.0225
## class counts: 16 2
## probabilities: 0.889 0.111
##
## Node number 77: 8 observations
## predicted class=Good expected loss=0.25 P(node) =0.01
## class counts: 2 6
## probabilities: 0.250 0.750
##
## Node number 78: 249 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4056225 P(node) =0.31125
## class counts: 101 148
## probabilities: 0.406 0.594
## left son=156 (221 obs) right son=157 (28 obs)
## Primary splits:
## Purpose.UsedCar splits as LR, improve=3.252686, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.954286, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=2.289768, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.211870, (0 missing)
## Amount < 1367.5 to the left, improve=2.089978, (0 missing)
## Surrogate splits:
## Amount < 8877 to the left, agree=0.892, adj=0.036, (0 split)
##
## Node number 79: 21 observations
## predicted class=Good expected loss=0.04761905 P(node) =0.02625
## class counts: 1 20
## probabilities: 0.048 0.952
##
## Node number 148: 17 observations
## predicted class=Bad expected loss=0.1176471 P(node) =0.02125
## class counts: 15 2
## probabilities: 0.882 0.118
##
## Node number 149: 9 observations
## predicted class=Good expected loss=0.2222222 P(node) =0.01125
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 156: 221 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4343891 P(node) =0.27625
## class counts: 96 125
## probabilities: 0.434 0.566
## left son=312 (192 obs) right son=313 (29 obs)
## Primary splits:
## CheckingAccountStatus.gt.200 splits as LR, improve=2.487012, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.102047, (0 missing)
## Duration < 25.5 to the right, improve=1.712852, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.665913, (0 missing)
## Amount < 4038.5 to the right, improve=1.635174, (0 missing)
##
## Node number 157: 28 observations
## predicted class=Good expected loss=0.1785714 P(node) =0.035
## class counts: 5 23
## probabilities: 0.179 0.821
##
## Node number 312: 192 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4635417 P(node) =0.24
## class counts: 89 103
## probabilities: 0.464 0.536
## left son=624 (104 obs) right son=625 (88 obs)
## Primary splits:
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.547276, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.533350, (0 missing)
## Duration < 28.5 to the right, improve=2.331440, (0 missing)
## Amount < 4038.5 to the right, improve=1.903117, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.898893, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.lt.100 splits as RL, agree=0.646, adj=0.227, (0 split)
## Age < 31.5 to the left, agree=0.625, adj=0.182, (0 split)
## Purpose.Business splits as LR, agree=0.609, adj=0.148, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.604, adj=0.136, (0 split)
## Purpose.Furniture.Equipment splits as RL, agree=0.589, adj=0.102, (0 split)
##
## Node number 313: 29 observations
## predicted class=Good expected loss=0.2413793 P(node) =0.03625
## class counts: 7 22
## probabilities: 0.241 0.759
##
## Node number 624: 104 observations, complexity param=0.01416667
## predicted class=Bad expected loss=0.4615385 P(node) =0.13
## class counts: 56 48
## probabilities: 0.538 0.462
## left son=1248 (73 obs) right son=1249 (31 obs)
## Primary splits:
## Duration < 16.5 to the right, improve=2.978211, (0 missing)
## Amount < 4276.5 to the right, improve=1.692308, (0 missing)
## CreditHistory.Critical splits as LR, improve=1.398866, (0 missing)
## Age < 54 to the left, improve=1.258265, (0 missing)
## Purpose.Business splits as LR, improve=1.258265, (0 missing)
## Surrogate splits:
## Amount < 1119.5 to the right, agree=0.827, adj=0.419, (0 split)
##
## Node number 625: 88 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.375 P(node) =0.11
## class counts: 33 55
## probabilities: 0.375 0.625
## left son=1250 (9 obs) right son=1251 (79 obs)
## Primary splits:
## Age < 48.5 to the right, improve=3.252813, (0 missing)
## Job.Management.SelfEmp.HighlyQualified splits as RL, improve=2.475000, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.964286, (0 missing)
## Amount < 2168.5 to the left, improve=1.424663, (0 missing)
## Personal.Male.Single splits as LR, improve=1.281056, (0 missing)
##
## Node number 1248: 73 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.3835616 P(node) =0.09125
## class counts: 45 28
## probabilities: 0.616 0.384
## left son=2496 (20 obs) right son=2497 (53 obs)
## Primary splits:
## Amount < 2178.5 to the left, improve=1.8563970, (0 missing)
## Duration < 31.5 to the right, improve=1.5753730, (0 missing)
## Age < 42.5 to the left, improve=1.1462310, (0 missing)
## Purpose.Business splits as LR, improve=1.0474710, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=0.9428169, (0 missing)
##
## Node number 1249: 31 observations, complexity param=0.008333333
## predicted class=Good expected loss=0.3548387 P(node) =0.03875
## class counts: 11 20
## probabilities: 0.355 0.645
## left son=2498 (22 obs) right son=2499 (9 obs)
## Primary splits:
## InstallmentRatePercentage < 3.5 to the right, improve=3.1935480, (0 missing)
## Purpose.Furniture.Equipment splits as LR, improve=2.8865310, (0 missing)
## Amount < 938.5 to the left, improve=1.7745010, (0 missing)
## Duration < 12.5 to the left, improve=1.1392010, (0 missing)
## Housing.Own splits as LR, improve=0.8251273, (0 missing)
## Surrogate splits:
## Amount < 1559 to the left, agree=0.774, adj=0.222, (0 split)
## Purpose.Furniture.Equipment splits as LR, agree=0.774, adj=0.222, (0 split)
## OtherDebtorsGuarantors.None splits as RL, agree=0.774, adj=0.222, (0 split)
## CreditHistory.Delay splits as LR, agree=0.742, adj=0.111, (0 split)
## Personal.Male.Divorced.Seperated splits as LR, agree=0.742, adj=0.111, (0 split)
##
## Node number 1250: 9 observations
## predicted class=Bad expected loss=0.2222222 P(node) =0.01125
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 1251: 79 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.3291139 P(node) =0.09875
## class counts: 26 53
## probabilities: 0.329 0.671
## left son=2502 (44 obs) right son=2503 (35 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=2.095167, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.937309, (0 missing)
## Personal.Male.Single splits as LR, improve=1.774107, (0 missing)
## Amount < 2168.5 to the left, improve=1.531237, (0 missing)
## EmploymentDuration.lt.1 splits as RL, improve=1.045725, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.734, adj=0.400, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.709, adj=0.343, (0 split)
## CreditHistory.PaidDuly splits as LR, agree=0.646, adj=0.200, (0 split)
## Housing.Rent splits as LR, agree=0.620, adj=0.143, (0 split)
## Age < 27.5 to the right, agree=0.608, adj=0.114, (0 split)
##
## Node number 2496: 20 observations
## predicted class=Bad expected loss=0.2 P(node) =0.025
## class counts: 16 4
## probabilities: 0.800 0.200
##
## Node number 2497: 53 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.4528302 P(node) =0.06625
## class counts: 29 24
## probabilities: 0.547 0.453
## left son=4994 (13 obs) right son=4995 (40 obs)
## Primary splits:
## Duration < 31.5 to the right, improve=1.6987660, (0 missing)
## Amount < 2452 to the right, improve=1.1026600, (0 missing)
## Age < 22.5 to the right, improve=1.1026600, (0 missing)
## Property.CarOther splits as LR, improve=0.9784367, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=0.9177457, (0 missing)
## Surrogate splits:
## CreditHistory.Critical splits as RL, agree=0.830, adj=0.308, (0 split)
## OtherDebtorsGuarantors.None splits as LR, agree=0.774, adj=0.077, (0 split)
##
## Node number 2498: 22 observations, complexity param=0.008333333
## predicted class=Bad expected loss=0.5 P(node) =0.0275
## class counts: 11 11
## probabilities: 0.500 0.500
## left son=4996 (10 obs) right son=4997 (12 obs)
## Primary splits:
## Age < 27.5 to the right, improve=1.4666670, (0 missing)
## Personal.Female.NotSingle splits as LR, improve=1.4666670, (0 missing)
## Amount < 800.5 to the left, improve=0.3928571, (0 missing)
## Housing.Own splits as LR, improve=0.3666667, (0 missing)
## Job.UnskilledResident splits as LR, improve=0.1047619, (0 missing)
## Surrogate splits:
## Telephone splits as LR, agree=0.727, adj=0.4, (0 split)
## CreditHistory.PaidDuly splits as LR, agree=0.727, adj=0.4, (0 split)
## Personal.Female.NotSingle splits as LR, agree=0.727, adj=0.4, (0 split)
## Personal.Male.Single splits as RL, agree=0.727, adj=0.4, (0 split)
## NumberExistingCredits < 1.5 to the right, agree=0.682, adj=0.3, (0 split)
##
## Node number 2499: 9 observations
## predicted class=Good expected loss=0 P(node) =0.01125
## class counts: 0 9
## probabilities: 0.000 1.000
##
## Node number 2502: 44 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.4318182 P(node) =0.055
## class counts: 19 25
## probabilities: 0.432 0.568
## left son=5004 (29 obs) right son=5005 (15 obs)
## Primary splits:
## CreditHistory.Critical splits as LR, improve=2.446082, (0 missing)
## NumberExistingCredits < 1.5 to the left, improve=2.139929, (0 missing)
## Amount < 2168.5 to the left, improve=1.958430, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.355615, (0 missing)
## Age < 29.5 to the right, improve=1.241484, (0 missing)
## Surrogate splits:
## CreditHistory.PaidDuly splits as RL, agree=0.727, adj=0.200, (0 split)
## Amount < 3408.5 to the left, agree=0.682, adj=0.067, (0 split)
## NumberExistingCredits < 1.5 to the left, agree=0.682, adj=0.067, (0 split)
## Property.Insurance splits as LR, agree=0.682, adj=0.067, (0 split)
##
## Node number 2503: 35 observations
## predicted class=Good expected loss=0.2 P(node) =0.04375
## class counts: 7 28
## probabilities: 0.200 0.800
##
## Node number 4994: 13 observations
## predicted class=Bad expected loss=0.2307692 P(node) =0.01625
## class counts: 10 3
## probabilities: 0.769 0.231
##
## Node number 4995: 40 observations, complexity param=0.009722222
## predicted class=Good expected loss=0.475 P(node) =0.05
## class counts: 19 21
## probabilities: 0.475 0.525
## left son=9990 (25 obs) right son=9991 (15 obs)
## Primary splits:
## Property.CarOther splits as LR, improve=2.083333, (0 missing)
## Age < 30.5 to the right, improve=1.763333, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=1.543407, (0 missing)
## Amount < 3575.5 to the left, improve=1.408333, (0 missing)
## Property.Insurance splits as RL, improve=1.213736, (0 missing)
## Surrogate splits:
## Property.Insurance splits as RL, agree=0.725, adj=0.267, (0 split)
## Amount < 3415 to the left, agree=0.675, adj=0.133, (0 split)
## Age < 21.5 to the right, agree=0.675, adj=0.133, (0 split)
## NumberExistingCredits < 2.5 to the left, agree=0.675, adj=0.133, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.675, adj=0.133, (0 split)
##
## Node number 4996: 10 observations
## predicted class=Bad expected loss=0.3 P(node) =0.0125
## class counts: 7 3
## probabilities: 0.700 0.300
##
## Node number 4997: 12 observations
## predicted class=Good expected loss=0.3333333 P(node) =0.015
## class counts: 4 8
## probabilities: 0.333 0.667
##
## Node number 5004: 29 observations, complexity param=0.009722222
## predicted class=Bad expected loss=0.4482759 P(node) =0.03625
## class counts: 16 13
## probabilities: 0.552 0.448
## left son=10008 (13 obs) right son=10009 (16 obs)
## Primary splits:
## Age < 33.5 to the right, improve=2.2294430, (0 missing)
## ResidenceDuration < 3.5 to the left, improve=1.2539180, (0 missing)
## Job.SkilledEmployee splits as LR, improve=1.0923020, (0 missing)
## Amount < 2168.5 to the left, improve=0.9313660, (0 missing)
## Duration < 20.5 to the right, improve=0.8210181, (0 missing)
## Surrogate splits:
## ResidenceDuration < 2.5 to the right, agree=0.724, adj=0.385, (0 split)
## Job.SkilledEmployee splits as LR, agree=0.724, adj=0.385, (0 split)
## Amount < 2965.5 to the right, agree=0.690, adj=0.308, (0 split)
## Purpose.Furniture.Equipment splits as RL, agree=0.690, adj=0.308, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.690, adj=0.308, (0 split)
##
## Node number 5005: 15 observations
## predicted class=Good expected loss=0.2 P(node) =0.01875
## class counts: 3 12
## probabilities: 0.200 0.800
##
## Node number 9990: 25 observations, complexity param=0.008333333
## predicted class=Bad expected loss=0.4 P(node) =0.03125
## class counts: 15 10
## probabilities: 0.600 0.400
## left son=19980 (17 obs) right son=19981 (8 obs)
## Primary splits:
## Amount < 3106 to the right, improve=1.1911760, (0 missing)
## Age < 25 to the right, improve=1.1911760, (0 missing)
## OtherInstallmentPlans.None splits as RL, improve=1.1911760, (0 missing)
## EmploymentDuration.lt.1 splits as RL, improve=0.8888889, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=0.5714286, (0 missing)
## Surrogate splits:
## Property.RealEstate splits as LR, agree=0.80, adj=0.375, (0 split)
## Age < 60.5 to the left, agree=0.76, adj=0.250, (0 split)
## Purpose.Radio.Television splits as LR, agree=0.76, adj=0.250, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.72, adj=0.125, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.72, adj=0.125, (0 split)
##
## Node number 9991: 15 observations
## predicted class=Good expected loss=0.2666667 P(node) =0.01875
## class counts: 4 11
## probabilities: 0.267 0.733
##
## Node number 10008: 13 observations
## predicted class=Bad expected loss=0.2307692 P(node) =0.01625
## class counts: 10 3
## probabilities: 0.769 0.231
##
## Node number 10009: 16 observations
## predicted class=Good expected loss=0.375 P(node) =0.02
## class counts: 6 10
## probabilities: 0.375 0.625
##
## Node number 19980: 17 observations
## predicted class=Bad expected loss=0.2941176 P(node) =0.02125
## class counts: 12 5
## probabilities: 0.706 0.294
##
## Node number 19981: 8 observations
## predicted class=Good expected loss=0.375 P(node) =0.01
## class counts: 3 5
## probabilities: 0.375 0.625
##################################
# Identifying the most predictive variables
# (variable importance scores stored in the
# summary object of the CP.005 pruned model)
##################################
rpartFit.Apparent.CP.005.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 25.2328263
## Amount SavingsAccountBonds.lt.100
## 19.8272797 13.9266136
## Age CheckingAccountStatus.0.to.200
## 11.5271393 10.9886136
## SavingsAccountBonds.Unknown SavingsAccountBonds.100.to.500
## 6.6785937 6.4489710
## CreditHistory.Critical OtherDebtorsGuarantors.Guarantor
## 5.0290653 4.9643144
## InstallmentRatePercentage Purpose.NewCar
## 4.2482033 4.1140587
## OtherDebtorsGuarantors.None OtherInstallmentPlans.None
## 3.4407069 3.4229365
## Purpose.UsedCar EmploymentDuration.4.to.7
## 3.2526862 3.0121485
## Purpose.Business SavingsAccountBonds.500.to.1000
## 2.8973105 2.7314685
## CheckingAccountStatus.gt.200 OtherInstallmentPlans.Bank
## 2.4870121 2.3750988
## Property.CarOther EmploymentDuration.gt.7
## 2.3092157 2.0745098
## Job.SkilledEmployee EmploymentDuration.1.to.4
## 1.9878199 1.7911160
## Purpose.Furniture.Equipment Purpose.Radio.Television
## 1.6561767 1.5369792
## CreditHistory.PaidDuly NumberExistingCredits
## 1.4949163 1.1574512
## Property.Insurance Property.RealEstate
## 1.1335296 0.8633578
## ResidenceDuration Job.Management.SelfEmp.HighlyQualified
## 0.8574781 0.7202881
## Personal.Female.NotSingle Personal.Male.Single
## 0.5866667 0.5866667
## Telephone CreditHistory.Delay
## 0.5866667 0.3548387
## Personal.Male.Divorced.Seperated Housing.Rent
## 0.3548387 0.2993096
## Job.UnskilledResident
## 0.1388889
##################################
# Drawing the structure of the pruned
# CP.005 RPART tree (no caption supplied)
##################################
fancyRpartPlot(
  model = rpartFit.Apparent.CP.005.Pruned,
  caption = NULL
)
##################################
# Evaluating the RPART model
# on the train set
##################################
# Predicted class labels from the CP.005 pruned tree for each row of MA_Train
MA_Train.Evaluated$PredClass.CP.005 <- predict(rpartFit.Apparent.CP.005.Pruned,
                                               newdata = MA_Train,
                                               type = "class")
# 1 where the predicted class equals the observed Class, 0 otherwise
MA_Train.Evaluated$PredCorrect.CP.005 <- ifelse(
  MA_Train.Evaluated$Class == MA_Train.Evaluated$PredClass.CP.005, 1, 0
)
##################################
# Computing for the
# apparent model performance
##################################
# Proportion of correctly classified train rows; the outer parentheses
# print the assigned value
(Train.CP.005 <- mean(MA_Train.Evaluated$PredCorrect.CP.005))
## [1] 0.83375
##################################
# Evaluating the RPART model
# on the test set
##################################
# Predicted class labels for the test set from the cp = 0.005 pruned tree
MA_Test.Evaluated$PredClass.CP.005 <- predict(rpartFit.Apparent.CP.005.Pruned,
                                              newdata = MA_Test,
                                              type = "class")
# 1 when the predicted class equals the observed Class, 0 otherwise
MA_Test.Evaluated$PredCorrect.CP.005 <- ifelse(
  MA_Test.Evaluated$Class == MA_Test.Evaluated$PredClass.CP.005, 1, 0)
##################################
# Computing for the
# external validation model performance
##################################
# Proportion of correctly classified test rows; the outer parentheses
# print the assigned value
(Test.CP.005 <- mean(MA_Test.Evaluated$PredCorrect.CP.005))
## [1] 0.745
##################################
# Formulating the RPART model
# using a complexity parameter setting
# equal to 0.010
#################################
# Fit a tree of Class on all other columns of MA_Train with cp = 0.010
rpartFit.Apparent.CP.010 <- rpart(Class ~ .,
                                  data = MA_Train,
                                  control = rpart.control(cp = 0.010))
# Print the complexity-parameter table of the fitted tree
printcp(rpartFit.Apparent.CP.010)
##
## Classification tree:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] Age Amount
## [3] CheckingAccountStatus.0.to.200 CheckingAccountStatus.gt.200
## [5] CheckingAccountStatus.none Duration
## [7] OtherDebtorsGuarantors.Guarantor Purpose.NewCar
## [9] Purpose.UsedCar SavingsAccountBonds.lt.100
##
## Root node error: 240/800 = 0.3
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.029167 0 1.00000 1.00000 0.054006
## 2 0.027083 5 0.84167 0.97917 0.053679
## 3 0.020833 7 0.78750 0.97083 0.053544
## 4 0.016667 8 0.76667 0.92083 0.052696
## 5 0.014167 9 0.75000 0.86250 0.051613
## 6 0.010000 15 0.65833 0.87500 0.051854
##################################
# Pruning the model
##################################
# Prune at the same cp value that was used to grow the tree
rpartFit.Apparent.CP.010.Pruned <- prune(rpartFit.Apparent.CP.010, cp = 0.010)
# summary() prints the node-by-node report; its return value is kept so
# variable.importance can be read from it later
rpartFit.Apparent.CP.010.Pruned.Summary <- summary(rpartFit.Apparent.CP.010.Pruned)
## Call:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.01))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.02916667 0 1.0000000 1.0000000 0.05400617
## 2 0.02708333 5 0.8416667 0.9791667 0.05367869
## 3 0.02083333 7 0.7875000 0.9708333 0.05354430
## 4 0.01666667 8 0.7666667 0.9208333 0.05269619
## 5 0.01416667 9 0.7500000 0.8625000 0.05161266
## 6 0.01000000 15 0.6583333 0.8750000 0.05185366
##
## Variable importance
## CheckingAccountStatus.none Duration
## 26 16
## Amount SavingsAccountBonds.lt.100
## 11 8
## CheckingAccountStatus.0.to.200 SavingsAccountBonds.Unknown
## 8 4
## SavingsAccountBonds.100.to.500 Age
## 4 4
## OtherDebtorsGuarantors.Guarantor Purpose.NewCar
## 4 3
## Purpose.UsedCar OtherDebtorsGuarantors.None
## 2 2
## CheckingAccountStatus.gt.200 CreditHistory.Critical
## 2 1
## SavingsAccountBonds.500.to.1000 Job.SkilledEmployee
## 1 1
## InstallmentRatePercentage EmploymentDuration.1.to.4
## 1 1
## EmploymentDuration.4.to.7
## 1
##
## Node number 1: 800 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.3 P(node) =1
## class counts: 240 560
## probabilities: 0.300 0.700
## left son=2 (484 obs) right son=3 (316 obs)
## Primary splits:
## CheckingAccountStatus.none splits as LR, improve=36.169470, (0 missing)
## CreditHistory.Critical splits as LR, improve=14.535700, (0 missing)
## Amount < 10918 to the right, improve=13.690600, (0 missing)
## Duration < 26.5 to the right, improve=10.957470, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve= 9.839849, (0 missing)
## Surrogate splits:
## CheckingAccountStatus.0.to.200 splits as RL, agree=0.660, adj=0.139, (0 split)
## CreditHistory.Critical splits as LR, agree=0.627, adj=0.057, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.626, adj=0.054, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.624, adj=0.047, (0 split)
## SavingsAccountBonds.lt.100 splits as RL, agree=0.623, adj=0.044, (0 split)
##
## Node number 2: 484 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4214876 P(node) =0.605
## class counts: 204 280
## probabilities: 0.421 0.579
## left son=4 (404 obs) right son=5 (80 obs)
## Primary splits:
## Duration < 11.5 to the right, improve=9.403355, (0 missing)
## Amount < 10841.5 to the right, improve=9.011413, (0 missing)
## CreditHistory.Critical splits as LR, improve=8.753453, (0 missing)
## Property.RealEstate splits as LR, improve=5.148565, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.821222, (0 missing)
## Surrogate splits:
## Age < 66.5 to the left, agree=0.845, adj=0.063, (0 split)
## Amount < 527.5 to the right, agree=0.841, adj=0.038, (0 split)
##
## Node number 3: 316 observations
## predicted class=Good expected loss=0.1139241 P(node) =0.395
## class counts: 36 280
## probabilities: 0.114 0.886
##
## Node number 4: 404 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4653465 P(node) =0.505
## class counts: 188 216
## probabilities: 0.465 0.535
## left son=8 (20 obs) right son=9 (384 obs)
## Primary splits:
## Amount < 10841.5 to the right, improve=6.226578, (0 missing)
## Duration < 47.5 to the right, improve=5.986296, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.939718, (0 missing)
## CreditHistory.Critical splits as LR, improve=4.354026, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=4.296602, (0 missing)
##
## Node number 5: 80 observations
## predicted class=Good expected loss=0.2 P(node) =0.1
## class counts: 16 64
## probabilities: 0.200 0.800
##
## Node number 8: 20 observations
## predicted class=Bad expected loss=0.15 P(node) =0.025
## class counts: 17 3
## probabilities: 0.850 0.150
##
## Node number 9: 384 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4453125 P(node) =0.48
## class counts: 171 213
## probabilities: 0.445 0.555
## left son=18 (88 obs) right son=19 (296 obs)
## Primary splits:
## Purpose.NewCar splits as RL, improve=4.114059, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.110862, (0 missing)
## Amount < 1381.5 to the left, improve=4.104405, (0 missing)
## Duration < 47.5 to the right, improve=4.095170, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.024666, (0 missing)
##
## Node number 18: 88 observations, complexity param=0.02708333
## predicted class=Bad expected loss=0.4204545 P(node) =0.11
## class counts: 51 37
## probabilities: 0.580 0.420
## left son=36 (33 obs) right son=37 (55 obs)
## Primary splits:
## Amount < 1392 to the left, improve=4.583333, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.207792, (0 missing)
## Age < 29.5 to the left, improve=2.506499, (0 missing)
## Duration < 22.5 to the right, improve=2.480381, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.376720, (0 missing)
## Surrogate splits:
## Duration < 13 to the left, agree=0.693, adj=0.182, (0 split)
## Age < 45.5 to the right, agree=0.670, adj=0.121, (0 split)
## Property.RealEstate splits as RL, agree=0.659, adj=0.091, (0 split)
## Job.UnskilledResident splits as RL, agree=0.636, adj=0.030, (0 split)
##
## Node number 19: 296 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4054054 P(node) =0.37
## class counts: 120 176
## probabilities: 0.405 0.595
## left son=38 (26 obs) right son=39 (270 obs)
## Primary splits:
## Duration < 46.5 to the right, improve=4.692446, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=3.771026, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.897053, (0 missing)
## Amount < 4038.5 to the right, improve=2.660474, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.495429, (0 missing)
##
## Node number 36: 33 observations
## predicted class=Bad expected loss=0.2121212 P(node) =0.04125
## class counts: 26 7
## probabilities: 0.788 0.212
##
## Node number 37: 55 observations, complexity param=0.02708333
## predicted class=Good expected loss=0.4545455 P(node) =0.06875
## class counts: 25 30
## probabilities: 0.455 0.545
## left son=74 (26 obs) right son=75 (29 obs)
## Primary splits:
## Duration < 22.5 to the right, improve=3.917290, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.563050, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.629870, (0 missing)
## Age < 29.5 to the left, improve=1.878706, (0 missing)
## Amount < 3904.5 to the right, improve=1.838554, (0 missing)
## Surrogate splits:
## Amount < 2674.5 to the right, agree=0.655, adj=0.269, (0 split)
## InstallmentRatePercentage < 3.5 to the right, agree=0.655, adj=0.269, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.618, adj=0.192, (0 split)
## EmploymentDuration.4.to.7 splits as RL, agree=0.618, adj=0.192, (0 split)
## Age < 27.5 to the left, agree=0.600, adj=0.154, (0 split)
##
## Node number 38: 26 observations, complexity param=0.01666667
## predicted class=Bad expected loss=0.3076923 P(node) =0.0325
## class counts: 18 8
## probabilities: 0.692 0.308
## left son=76 (18 obs) right son=77 (8 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=4.521368, (0 missing)
## Personal.Male.Single splits as LR, improve=2.188034, (0 missing)
## ResidenceDuration < 3.5 to the right, improve=2.155711, (0 missing)
## EmploymentDuration.gt.7 splits as RL, improve=1.813765, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=1.230769, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.Unknown splits as LR, agree=0.885, adj=0.625, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.808, adj=0.375, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.808, adj=0.375, (0 split)
## Job.SkilledEmployee splits as RL, agree=0.769, adj=0.250, (0 split)
## Amount < 4143.5 to the right, agree=0.731, adj=0.125, (0 split)
##
## Node number 39: 270 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.3777778 P(node) =0.3375
## class counts: 102 168
## probabilities: 0.378 0.622
## left son=78 (249 obs) right son=79 (21 obs)
## Primary splits:
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.964314, (0 missing)
## Purpose.UsedCar splits as LR, improve=3.008333, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.748558, (0 missing)
## OtherDebtorsGuarantors.None splits as RL, improve=2.377253, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.202868, (0 missing)
## Surrogate splits:
## OtherDebtorsGuarantors.None splits as RL, agree=0.963, adj=0.524, (0 split)
##
## Node number 74: 26 observations, complexity param=0.02083333
## predicted class=Bad expected loss=0.3461538 P(node) =0.0325
## class counts: 17 9
## probabilities: 0.654 0.346
## left son=148 (17 obs) right son=149 (9 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=5.128708, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.769231, (0 missing)
## Age < 29 to the left, improve=2.484382, (0 missing)
## Telephone splits as RL, improve=1.207139, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.207139, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.885, adj=0.667, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.769, adj=0.333, (0 split)
## Duration < 47.5 to the left, agree=0.731, adj=0.222, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.731, adj=0.222, (0 split)
## Amount < 5897 to the left, agree=0.692, adj=0.111, (0 split)
##
## Node number 75: 29 observations
## predicted class=Good expected loss=0.2758621 P(node) =0.03625
## class counts: 8 21
## probabilities: 0.276 0.724
##
## Node number 76: 18 observations
## predicted class=Bad expected loss=0.1111111 P(node) =0.0225
## class counts: 16 2
## probabilities: 0.889 0.111
##
## Node number 77: 8 observations
## predicted class=Good expected loss=0.25 P(node) =0.01
## class counts: 2 6
## probabilities: 0.250 0.750
##
## Node number 78: 249 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4056225 P(node) =0.31125
## class counts: 101 148
## probabilities: 0.406 0.594
## left son=156 (221 obs) right son=157 (28 obs)
## Primary splits:
## Purpose.UsedCar splits as LR, improve=3.252686, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.954286, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=2.289768, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.211870, (0 missing)
## Amount < 1367.5 to the left, improve=2.089978, (0 missing)
## Surrogate splits:
## Amount < 8877 to the left, agree=0.892, adj=0.036, (0 split)
##
## Node number 79: 21 observations
## predicted class=Good expected loss=0.04761905 P(node) =0.02625
## class counts: 1 20
## probabilities: 0.048 0.952
##
## Node number 148: 17 observations
## predicted class=Bad expected loss=0.1176471 P(node) =0.02125
## class counts: 15 2
## probabilities: 0.882 0.118
##
## Node number 149: 9 observations
## predicted class=Good expected loss=0.2222222 P(node) =0.01125
## class counts: 2 7
## probabilities: 0.222 0.778
##
## Node number 156: 221 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4343891 P(node) =0.27625
## class counts: 96 125
## probabilities: 0.434 0.566
## left son=312 (192 obs) right son=313 (29 obs)
## Primary splits:
## CheckingAccountStatus.gt.200 splits as LR, improve=2.487012, (0 missing)
## InstallmentRatePercentage < 3.5 to the right, improve=2.102047, (0 missing)
## Duration < 25.5 to the right, improve=1.712852, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.665913, (0 missing)
## Amount < 4038.5 to the right, improve=1.635174, (0 missing)
##
## Node number 157: 28 observations
## predicted class=Good expected loss=0.1785714 P(node) =0.035
## class counts: 5 23
## probabilities: 0.179 0.821
##
## Node number 312: 192 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.4635417 P(node) =0.24
## class counts: 89 103
## probabilities: 0.464 0.536
## left son=624 (104 obs) right son=625 (88 obs)
## Primary splits:
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.547276, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.533350, (0 missing)
## Duration < 28.5 to the right, improve=2.331440, (0 missing)
## Amount < 4038.5 to the right, improve=1.903117, (0 missing)
## Personal.Male.Divorced.Seperated splits as RL, improve=1.898893, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.lt.100 splits as RL, agree=0.646, adj=0.227, (0 split)
## Age < 31.5 to the left, agree=0.625, adj=0.182, (0 split)
## Purpose.Business splits as LR, agree=0.609, adj=0.148, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.604, adj=0.136, (0 split)
## Purpose.Furniture.Equipment splits as RL, agree=0.589, adj=0.102, (0 split)
##
## Node number 313: 29 observations
## predicted class=Good expected loss=0.2413793 P(node) =0.03625
## class counts: 7 22
## probabilities: 0.241 0.759
##
## Node number 624: 104 observations, complexity param=0.01416667
## predicted class=Bad expected loss=0.4615385 P(node) =0.13
## class counts: 56 48
## probabilities: 0.538 0.462
## left son=1248 (73 obs) right son=1249 (31 obs)
## Primary splits:
## Duration < 16.5 to the right, improve=2.978211, (0 missing)
## Amount < 4276.5 to the right, improve=1.692308, (0 missing)
## CreditHistory.Critical splits as LR, improve=1.398866, (0 missing)
## Age < 54 to the left, improve=1.258265, (0 missing)
## Purpose.Business splits as LR, improve=1.258265, (0 missing)
## Surrogate splits:
## Amount < 1119.5 to the right, agree=0.827, adj=0.419, (0 split)
##
## Node number 625: 88 observations, complexity param=0.01416667
## predicted class=Good expected loss=0.375 P(node) =0.11
## class counts: 33 55
## probabilities: 0.375 0.625
## left son=1250 (9 obs) right son=1251 (79 obs)
## Primary splits:
## Age < 48.5 to the right, improve=3.252813, (0 missing)
## Job.Management.SelfEmp.HighlyQualified splits as RL, improve=2.475000, (0 missing)
## EmploymentDuration.4.to.7 splits as LR, improve=1.964286, (0 missing)
## Amount < 2168.5 to the left, improve=1.424663, (0 missing)
## Personal.Male.Single splits as LR, improve=1.281056, (0 missing)
##
## Node number 1248: 73 observations
## predicted class=Bad expected loss=0.3835616 P(node) =0.09125
## class counts: 45 28
## probabilities: 0.616 0.384
##
## Node number 1249: 31 observations
## predicted class=Good expected loss=0.3548387 P(node) =0.03875
## class counts: 11 20
## probabilities: 0.355 0.645
##
## Node number 1250: 9 observations
## predicted class=Bad expected loss=0.2222222 P(node) =0.01125
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 1251: 79 observations
## predicted class=Good expected loss=0.3291139 P(node) =0.09875
## class counts: 26 53
## probabilities: 0.329 0.671
##################################
# Identifying the most predictive variables
##################################
# Variable importance scores held in the summary of the cp = 0.010 pruned tree
rpartFit.Apparent.CP.010.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 22.9643483
## Amount SavingsAccountBonds.lt.100
## 14.7173141 11.8314467
## CheckingAccountStatus.0.to.200 SavingsAccountBonds.Unknown
## 10.9886136 5.6824730
## SavingsAccountBonds.100.to.500 Age
## 5.4620072 5.4618791
## OtherDebtorsGuarantors.Guarantor Purpose.NewCar
## 4.9643144 4.1140587
## Purpose.UsedCar OtherDebtorsGuarantors.None
## 3.2526862 2.6003552
## CheckingAccountStatus.gt.200 CreditHistory.Critical
## 2.4870121 2.0602865
## SavingsAccountBonds.500.to.1000 Job.SkilledEmployee
## 1.9458261 1.1303419
## InstallmentRatePercentage EmploymentDuration.1.to.4
## 1.0546549 0.7533249
## EmploymentDuration.4.to.7 Property.RealEstate
## 0.7533249 0.4166667
## Purpose.Business Purpose.Furniture.Equipment
## 0.3763021 0.2605168
## Job.UnskilledResident
## 0.1388889
##################################
# Plotting the RPART model structure
##################################
fancyRpartPlot(rpartFit.Apparent.CP.010.Pruned, caption = NULL)
##################################
# Evaluating the RPART model
# on the train set
##################################
# Predicted class labels for the train set from the cp = 0.010 pruned tree
MA_Train.Evaluated$PredClass.CP.010 <- predict(rpartFit.Apparent.CP.010.Pruned,
                                               newdata = MA_Train,
                                               type = "class")
# 1 when the predicted class equals the observed Class, 0 otherwise
MA_Train.Evaluated$PredCorrect.CP.010 <- ifelse(
  MA_Train.Evaluated$Class == MA_Train.Evaluated$PredClass.CP.010, 1, 0)
##################################
# Computing for the
# apparent model performance
##################################
# Proportion of correctly classified train rows; the outer parentheses
# print the assigned value
(Train.CP.010 <- mean(MA_Train.Evaluated$PredCorrect.CP.010))
## [1] 0.8025
##################################
# Evaluating the RPART model
# on the test set
##################################
# Predicted class labels for the test set from the cp = 0.010 pruned tree
MA_Test.Evaluated$PredClass.CP.010 <- predict(rpartFit.Apparent.CP.010.Pruned,
                                              newdata = MA_Test,
                                              type = "class")
# 1 when the predicted class equals the observed Class, 0 otherwise
MA_Test.Evaluated$PredCorrect.CP.010 <- ifelse(
  MA_Test.Evaluated$Class == MA_Test.Evaluated$PredClass.CP.010, 1, 0)
##################################
# Computing for the
# external validation model performance
##################################
# Proportion of correctly classified test rows; the outer parentheses
# print the assigned value
(Test.CP.010 <- mean(MA_Test.Evaluated$PredCorrect.CP.010))
## [1] 0.74
##################################
# Formulating the RPART model
# using a complexity parameter setting
# equal to 0.015
#################################
# Fit a tree of Class on all other columns of MA_Train with cp = 0.015
rpartFit.Apparent.CP.015 <- rpart(Class ~ .,
                                  data = MA_Train,
                                  control = rpart.control(cp = 0.015))
# Print the complexity-parameter table of the fitted tree
printcp(rpartFit.Apparent.CP.015)
##
## Classification tree:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.015))
##
## Variables actually used in tree construction:
## [1] Amount CheckingAccountStatus.none
## [3] Duration Purpose.NewCar
## [5] SavingsAccountBonds.lt.100
##
## Root node error: 240/800 = 0.3
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.029167 0 1.00000 1.00000 0.054006
## 2 0.027083 5 0.84167 0.98333 0.053745
## 3 0.020833 7 0.78750 1.00417 0.054070
## 4 0.016667 8 0.76667 0.99583 0.053942
## 5 0.015000 9 0.75000 0.99167 0.053877
##################################
# Pruning the model
##################################
# Prune at the same cp value that was used to grow the tree
rpartFit.Apparent.CP.015.Pruned <- prune(rpartFit.Apparent.CP.015, cp = 0.015)
# summary() prints the node-by-node report; its return value is kept so
# variable.importance can be read from it later
rpartFit.Apparent.CP.015.Pruned.Summary <- summary(rpartFit.Apparent.CP.015.Pruned)
## Call:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.015))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.02916667 0 1.0000000 1.0000000 0.05400617
## 2 0.02708333 5 0.8416667 0.9833333 0.05374515
## 3 0.02083333 7 0.7875000 1.0041667 0.05407023
## 4 0.01666667 8 0.7666667 0.9958333 0.05394164
## 5 0.01500000 9 0.7500000 0.9916667 0.05387663
##
## Variable importance
## CheckingAccountStatus.none Duration
## 32 18
## Amount SavingsAccountBonds.lt.100
## 12 10
## CheckingAccountStatus.0.to.200 SavingsAccountBonds.Unknown
## 7 5
## SavingsAccountBonds.100.to.500 Purpose.NewCar
## 4 4
## CreditHistory.Critical SavingsAccountBonds.500.to.1000
## 2 2
## Age Job.SkilledEmployee
## 2 1
## InstallmentRatePercentage EmploymentDuration.1.to.4
## 1 1
## EmploymentDuration.4.to.7
## 1
##
## Node number 1: 800 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.3 P(node) =1
## class counts: 240 560
## probabilities: 0.300 0.700
## left son=2 (484 obs) right son=3 (316 obs)
## Primary splits:
## CheckingAccountStatus.none splits as LR, improve=36.169470, (0 missing)
## CreditHistory.Critical splits as LR, improve=14.535700, (0 missing)
## Amount < 10918 to the right, improve=13.690600, (0 missing)
## Duration < 26.5 to the right, improve=10.957470, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve= 9.839849, (0 missing)
## Surrogate splits:
## CheckingAccountStatus.0.to.200 splits as RL, agree=0.660, adj=0.139, (0 split)
## CreditHistory.Critical splits as LR, agree=0.627, adj=0.057, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.626, adj=0.054, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.624, adj=0.047, (0 split)
## SavingsAccountBonds.lt.100 splits as RL, agree=0.623, adj=0.044, (0 split)
##
## Node number 2: 484 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4214876 P(node) =0.605
## class counts: 204 280
## probabilities: 0.421 0.579
## left son=4 (404 obs) right son=5 (80 obs)
## Primary splits:
## Duration < 11.5 to the right, improve=9.403355, (0 missing)
## Amount < 10841.5 to the right, improve=9.011413, (0 missing)
## CreditHistory.Critical splits as LR, improve=8.753453, (0 missing)
## Property.RealEstate splits as LR, improve=5.148565, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.821222, (0 missing)
## Surrogate splits:
## Age < 66.5 to the left, agree=0.845, adj=0.063, (0 split)
## Amount < 527.5 to the right, agree=0.841, adj=0.038, (0 split)
##
## Node number 3: 316 observations
## predicted class=Good expected loss=0.1139241 P(node) =0.395
## class counts: 36 280
## probabilities: 0.114 0.886
##
## Node number 4: 404 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4653465 P(node) =0.505
## class counts: 188 216
## probabilities: 0.465 0.535
## left son=8 (20 obs) right son=9 (384 obs)
## Primary splits:
## Amount < 10841.5 to the right, improve=6.226578, (0 missing)
## Duration < 47.5 to the right, improve=5.986296, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.939718, (0 missing)
## CreditHistory.Critical splits as LR, improve=4.354026, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=4.296602, (0 missing)
##
## Node number 5: 80 observations
## predicted class=Good expected loss=0.2 P(node) =0.1
## class counts: 16 64
## probabilities: 0.200 0.800
##
## Node number 8: 20 observations
## predicted class=Bad expected loss=0.15 P(node) =0.025
## class counts: 17 3
## probabilities: 0.850 0.150
##
## Node number 9: 384 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4453125 P(node) =0.48
## class counts: 171 213
## probabilities: 0.445 0.555
## left son=18 (88 obs) right son=19 (296 obs)
## Primary splits:
## Purpose.NewCar splits as RL, improve=4.114059, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.110862, (0 missing)
## Amount < 1381.5 to the left, improve=4.104405, (0 missing)
## Duration < 47.5 to the right, improve=4.095170, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.024666, (0 missing)
##
## Node number 18: 88 observations, complexity param=0.02708333
## predicted class=Bad expected loss=0.4204545 P(node) =0.11
## class counts: 51 37
## probabilities: 0.580 0.420
## left son=36 (33 obs) right son=37 (55 obs)
## Primary splits:
## Amount < 1392 to the left, improve=4.583333, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.207792, (0 missing)
## Age < 29.5 to the left, improve=2.506499, (0 missing)
## Duration < 22.5 to the right, improve=2.480381, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.376720, (0 missing)
## Surrogate splits:
## Duration < 13 to the left, agree=0.693, adj=0.182, (0 split)
## Age < 45.5 to the right, agree=0.670, adj=0.121, (0 split)
## Property.RealEstate splits as RL, agree=0.659, adj=0.091, (0 split)
## Job.UnskilledResident splits as RL, agree=0.636, adj=0.030, (0 split)
##
## Node number 19: 296 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4054054 P(node) =0.37
## class counts: 120 176
## probabilities: 0.405 0.595
## left son=38 (26 obs) right son=39 (270 obs)
## Primary splits:
## Duration < 46.5 to the right, improve=4.692446, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=3.771026, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.897053, (0 missing)
## Amount < 4038.5 to the right, improve=2.660474, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.495429, (0 missing)
##
## Node number 36: 33 observations
## predicted class=Bad expected loss=0.2121212 P(node) =0.04125
## class counts: 26 7
## probabilities: 0.788 0.212
##
## Node number 37: 55 observations, complexity param=0.02708333
## predicted class=Good expected loss=0.4545455 P(node) =0.06875
## class counts: 25 30
## probabilities: 0.455 0.545
## left son=74 (26 obs) right son=75 (29 obs)
## Primary splits:
## Duration < 22.5 to the right, improve=3.917290, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.563050, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.629870, (0 missing)
## Age < 29.5 to the left, improve=1.878706, (0 missing)
## Amount < 3904.5 to the right, improve=1.838554, (0 missing)
## Surrogate splits:
## Amount < 2674.5 to the right, agree=0.655, adj=0.269, (0 split)
## InstallmentRatePercentage < 3.5 to the right, agree=0.655, adj=0.269, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.618, adj=0.192, (0 split)
## EmploymentDuration.4.to.7 splits as RL, agree=0.618, adj=0.192, (0 split)
## Age < 27.5 to the left, agree=0.600, adj=0.154, (0 split)
##
## Node number 38: 26 observations, complexity param=0.01666667
## predicted class=Bad expected loss=0.3076923 P(node) =0.0325
## class counts: 18 8
## probabilities: 0.692 0.308
## left son=76 (18 obs) right son=77 (8 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=4.521368, (0 missing)
## Personal.Male.Single splits as LR, improve=2.188034, (0 missing)
## ResidenceDuration < 3.5 to the right, improve=2.155711, (0 missing)
## EmploymentDuration.gt.7 splits as RL, improve=1.813765, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=1.230769, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.Unknown splits as LR, agree=0.885, adj=0.625, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.808, adj=0.375, (0 split)
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.808, adj=0.375, (0 split)
## Job.SkilledEmployee splits as RL, agree=0.769, adj=0.250, (0 split)
## Amount < 4143.5 to the right, agree=0.731, adj=0.125, (0 split)
##
## Node number 39: 270 observations
## predicted class=Good expected loss=0.3777778 P(node) =0.3375
## class counts: 102 168
## probabilities: 0.378 0.622
##
## Node number 74: 26 observations, complexity param=0.02083333
## predicted class=Bad expected loss=0.3461538 P(node) =0.0325
## class counts: 17 9
## probabilities: 0.654 0.346
## left son=148 (17 obs) right son=149 (9 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=5.128708, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.769231, (0 missing)
## Age < 29 to the left, improve=2.484382, (0 missing)
## Telephone splits as RL, improve=1.207139, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.207139, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.885, adj=0.667, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.769, adj=0.333, (0 split)
## Duration < 47.5 to the left, agree=0.731, adj=0.222, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.731, adj=0.222, (0 split)
## Amount < 5897 to the left, agree=0.692, adj=0.111, (0 split)
##
## Node number 75: 29 observations
## predicted class=Good expected loss=0.2758621 P(node) =0.03625
## class counts: 8 21
## probabilities: 0.276 0.724
##
## Node number 76: 18 observations
## predicted class=Bad expected loss=0.1111111 P(node) =0.0225
## class counts: 16 2
## probabilities: 0.889 0.111
##
## Node number 77: 8 observations
## predicted class=Good expected loss=0.25 P(node) =0.01
## class counts: 2 6
## probabilities: 0.250 0.750
##
## Node number 148: 17 observations
## predicted class=Bad expected loss=0.1176471 P(node) =0.02125
## class counts: 15 2
## probabilities: 0.882 0.118
##
## Node number 149: 9 observations
## predicted class=Good expected loss=0.2222222 P(node) =0.01125
## class counts: 2 7
## probabilities: 0.222 0.778
##################################
# Identifying the most predictive variables
##################################
# Variable importance scores held in the summary of the cp = 0.015 pruned tree
rpartFit.Apparent.CP.015.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 19.9861370
## Amount SavingsAccountBonds.lt.100
## 13.3522194 11.2525205
## CheckingAccountStatus.0.to.200 SavingsAccountBonds.Unknown
## 8.4413380 5.6824730
## SavingsAccountBonds.100.to.500 Purpose.NewCar
## 5.1146514 4.1140587
## CreditHistory.Critical SavingsAccountBonds.500.to.1000
## 2.0602865 1.9458261
## Age Job.SkilledEmployee
## 1.7459252 1.1303419
## InstallmentRatePercentage EmploymentDuration.1.to.4
## 1.0546549 0.7533249
## EmploymentDuration.4.to.7 Property.RealEstate
## 0.7533249 0.4166667
## Job.UnskilledResident
## 0.1388889
##################################
# Plotting the RPART model structure
##################################
fancyRpartPlot(rpartFit.Apparent.CP.015.Pruned, caption = NULL)
##################################
# Evaluating the RPART model
# on the train set
##################################
# Predicted class labels for the train set from the cp = 0.015 pruned tree
MA_Train.Evaluated$PredClass.CP.015 <- predict(rpartFit.Apparent.CP.015.Pruned,
                                               newdata = MA_Train,
                                               type = "class")
# 1 when the predicted class equals the observed Class, 0 otherwise
MA_Train.Evaluated$PredCorrect.CP.015 <- ifelse(
  MA_Train.Evaluated$Class == MA_Train.Evaluated$PredClass.CP.015, 1, 0)
##################################
# Computing for the
# apparent model performance
##################################
# Proportion of correctly classified train rows; the outer parentheses
# print the assigned value
(Train.CP.015 <- mean(MA_Train.Evaluated$PredCorrect.CP.015))
## [1] 0.775
##################################
# Evaluating the RPART model
# on the test set
##################################
# Predicted class labels for the test set from the cp = 0.015 pruned tree
MA_Test.Evaluated$PredClass.CP.015 <- predict(rpartFit.Apparent.CP.015.Pruned,
                                              newdata = MA_Test,
                                              type = "class")
# 1 when the predicted class equals the observed Class, 0 otherwise
MA_Test.Evaluated$PredCorrect.CP.015 <- ifelse(
  MA_Test.Evaluated$Class == MA_Test.Evaluated$PredClass.CP.015, 1, 0)
##################################
# Computing for the
# external validation model performance
##################################
.015 <- mean(MA_Test.Evaluated$PredCorrect.CP.015)) (Test.CP
## [1] 0.73
##################################
# Formulating the RPART model
# using a complexity parameter setting
# equal to 0.020
#################################
# Fit a classification tree for Class on all other columns of MA_Train,
# with the complexity parameter set to 0.020.
rpartFit.Apparent.CP.020 <- rpart(
  Class ~ .,
  data = MA_Train,
  control = rpart.control(cp = 0.020)
)
# Show the complexity-parameter table of the fitted tree.
printcp(rpartFit.Apparent.CP.020)
##
## Classification tree:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.02))
##
## Variables actually used in tree construction:
## [1] Amount CheckingAccountStatus.none
## [3] Duration Purpose.NewCar
## [5] SavingsAccountBonds.lt.100
##
## Root node error: 240/800 = 0.3
##
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.029167 0 1.00000 1.0000 0.054006
## 2 0.027083 5 0.84167 1.0208 0.054322
## 3 0.020833 7 0.78750 1.0000 0.054006
## 4 0.020000 8 0.76667 1.0000 0.054006
##################################
# Pruning the model
##################################
# Prune the CP = 0.020 tree at the same complexity-parameter value.
rpartFit.Apparent.CP.020.Pruned <- prune(rpartFit.Apparent.CP.020, cp = 0.020)

# summary() prints the node-by-node report; its return value is kept so that
# the variable importance can be read from it afterwards.
rpartFit.Apparent.CP.020.Pruned.Summary <- summary(rpartFit.Apparent.CP.020.Pruned)
## Call:
## rpart(formula = Class ~ ., data = MA_Train, control = rpart.control(cp = 0.02))
## n= 800
##
## CP nsplit rel error xerror xstd
## 1 0.02916667 0 1.0000000 1.000000 0.05400617
## 2 0.02708333 5 0.8416667 1.020833 0.05432169
## 3 0.02083333 7 0.7875000 1.000000 0.05400617
## 4 0.02000000 8 0.7666667 1.000000 0.05400617
##
## Variable importance
## CheckingAccountStatus.none Duration
## 36 20
## Amount CheckingAccountStatus.0.to.200
## 13 7
## SavingsAccountBonds.lt.100 Purpose.NewCar
## 7 4
## SavingsAccountBonds.100.to.500 SavingsAccountBonds.Unknown
## 3 3
## CreditHistory.Critical SavingsAccountBonds.500.to.1000
## 2 2
## Age InstallmentRatePercentage
## 2 1
## EmploymentDuration.1.to.4 EmploymentDuration.4.to.7
## 1 1
##
## Node number 1: 800 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.3 P(node) =1
## class counts: 240 560
## probabilities: 0.300 0.700
## left son=2 (484 obs) right son=3 (316 obs)
## Primary splits:
## CheckingAccountStatus.none splits as LR, improve=36.169470, (0 missing)
## CreditHistory.Critical splits as LR, improve=14.535700, (0 missing)
## Amount < 10918 to the right, improve=13.690600, (0 missing)
## Duration < 26.5 to the right, improve=10.957470, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve= 9.839849, (0 missing)
## Surrogate splits:
## CheckingAccountStatus.0.to.200 splits as RL, agree=0.660, adj=0.139, (0 split)
## CreditHistory.Critical splits as LR, agree=0.627, adj=0.057, (0 split)
## SavingsAccountBonds.500.to.1000 splits as LR, agree=0.626, adj=0.054, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.624, adj=0.047, (0 split)
## SavingsAccountBonds.lt.100 splits as RL, agree=0.623, adj=0.044, (0 split)
##
## Node number 2: 484 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4214876 P(node) =0.605
## class counts: 204 280
## probabilities: 0.421 0.579
## left son=4 (404 obs) right son=5 (80 obs)
## Primary splits:
## Duration < 11.5 to the right, improve=9.403355, (0 missing)
## Amount < 10841.5 to the right, improve=9.011413, (0 missing)
## CreditHistory.Critical splits as LR, improve=8.753453, (0 missing)
## Property.RealEstate splits as LR, improve=5.148565, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=4.821222, (0 missing)
## Surrogate splits:
## Age < 66.5 to the left, agree=0.845, adj=0.063, (0 split)
## Amount < 527.5 to the right, agree=0.841, adj=0.038, (0 split)
##
## Node number 3: 316 observations
## predicted class=Good expected loss=0.1139241 P(node) =0.395
## class counts: 36 280
## probabilities: 0.114 0.886
##
## Node number 4: 404 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4653465 P(node) =0.505
## class counts: 188 216
## probabilities: 0.465 0.535
## left son=8 (20 obs) right son=9 (384 obs)
## Primary splits:
## Amount < 10841.5 to the right, improve=6.226578, (0 missing)
## Duration < 47.5 to the right, improve=5.986296, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.939718, (0 missing)
## CreditHistory.Critical splits as LR, improve=4.354026, (0 missing)
## CheckingAccountStatus.gt.200 splits as LR, improve=4.296602, (0 missing)
##
## Node number 5: 80 observations
## predicted class=Good expected loss=0.2 P(node) =0.1
## class counts: 16 64
## probabilities: 0.200 0.800
##
## Node number 8: 20 observations
## predicted class=Bad expected loss=0.15 P(node) =0.025
## class counts: 17 3
## probabilities: 0.850 0.150
##
## Node number 9: 384 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4453125 P(node) =0.48
## class counts: 171 213
## probabilities: 0.445 0.555
## left son=18 (88 obs) right son=19 (296 obs)
## Primary splits:
## Purpose.NewCar splits as RL, improve=4.114059, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=4.110862, (0 missing)
## Amount < 1381.5 to the left, improve=4.104405, (0 missing)
## Duration < 47.5 to the right, improve=4.095170, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.024666, (0 missing)
##
## Node number 18: 88 observations, complexity param=0.02708333
## predicted class=Bad expected loss=0.4204545 P(node) =0.11
## class counts: 51 37
## probabilities: 0.580 0.420
## left son=36 (33 obs) right son=37 (55 obs)
## Primary splits:
## Amount < 1392 to the left, improve=4.583333, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=4.207792, (0 missing)
## Age < 29.5 to the left, improve=2.506499, (0 missing)
## Duration < 22.5 to the right, improve=2.480381, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.376720, (0 missing)
## Surrogate splits:
## Duration < 13 to the left, agree=0.693, adj=0.182, (0 split)
## Age < 45.5 to the right, agree=0.670, adj=0.121, (0 split)
## Property.RealEstate splits as RL, agree=0.659, adj=0.091, (0 split)
## Job.UnskilledResident splits as RL, agree=0.636, adj=0.030, (0 split)
##
## Node number 19: 296 observations, complexity param=0.02916667
## predicted class=Good expected loss=0.4054054 P(node) =0.37
## class counts: 120 176
## probabilities: 0.405 0.595
## left son=38 (26 obs) right son=39 (270 obs)
## Primary splits:
## Duration < 46.5 to the right, improve=4.692446, (0 missing)
## OtherDebtorsGuarantors.Guarantor splits as LR, improve=3.771026, (0 missing)
## CreditHistory.Critical splits as LR, improve=2.897053, (0 missing)
## Amount < 4038.5 to the right, improve=2.660474, (0 missing)
## CheckingAccountStatus.0.to.200 splits as LR, improve=2.495429, (0 missing)
##
## Node number 36: 33 observations
## predicted class=Bad expected loss=0.2121212 P(node) =0.04125
## class counts: 26 7
## probabilities: 0.788 0.212
##
## Node number 37: 55 observations, complexity param=0.02708333
## predicted class=Good expected loss=0.4545455 P(node) =0.06875
## class counts: 25 30
## probabilities: 0.455 0.545
## left son=74 (26 obs) right son=75 (29 obs)
## Primary splits:
## Duration < 22.5 to the right, improve=3.917290, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.563050, (0 missing)
## SavingsAccountBonds.lt.100 splits as RL, improve=2.629870, (0 missing)
## Age < 29.5 to the left, improve=1.878706, (0 missing)
## Amount < 3904.5 to the right, improve=1.838554, (0 missing)
## Surrogate splits:
## Amount < 2674.5 to the right, agree=0.655, adj=0.269, (0 split)
## InstallmentRatePercentage < 3.5 to the right, agree=0.655, adj=0.269, (0 split)
## EmploymentDuration.1.to.4 splits as LR, agree=0.618, adj=0.192, (0 split)
## EmploymentDuration.4.to.7 splits as RL, agree=0.618, adj=0.192, (0 split)
## Age < 27.5 to the left, agree=0.600, adj=0.154, (0 split)
##
## Node number 38: 26 observations
## predicted class=Bad expected loss=0.3076923 P(node) =0.0325
## class counts: 18 8
## probabilities: 0.692 0.308
##
## Node number 39: 270 observations
## predicted class=Good expected loss=0.3777778 P(node) =0.3375
## class counts: 102 168
## probabilities: 0.378 0.622
##
## Node number 74: 26 observations, complexity param=0.02083333
## predicted class=Bad expected loss=0.3461538 P(node) =0.0325
## class counts: 17 9
## probabilities: 0.654 0.346
## left son=148 (17 obs) right son=149 (9 obs)
## Primary splits:
## SavingsAccountBonds.lt.100 splits as RL, improve=5.128708, (0 missing)
## InstallmentRatePercentage < 2.5 to the right, improve=3.769231, (0 missing)
## Age < 29 to the left, improve=2.484382, (0 missing)
## Telephone splits as RL, improve=1.207139, (0 missing)
## CreditHistory.PaidDuly splits as RL, improve=1.207139, (0 missing)
## Surrogate splits:
## SavingsAccountBonds.100.to.500 splits as LR, agree=0.885, adj=0.667, (0 split)
## CheckingAccountStatus.0.to.200 splits as LR, agree=0.769, adj=0.333, (0 split)
## Duration < 47.5 to the left, agree=0.731, adj=0.222, (0 split)
## SavingsAccountBonds.Unknown splits as LR, agree=0.731, adj=0.222, (0 split)
## Amount < 5897 to the left, agree=0.692, adj=0.111, (0 split)
##
## Node number 75: 29 observations
## predicted class=Good expected loss=0.2758621 P(node) =0.03625
## class counts: 8 21
## probabilities: 0.276 0.724
##
## Node number 148: 17 observations
## predicted class=Bad expected loss=0.1176471 P(node) =0.02125
## class counts: 15 2
## probabilities: 0.882 0.118
##
## Node number 149: 9 observations
## predicted class=Good expected loss=0.2222222 P(node) =0.01125
## class counts: 2 7
## probabilities: 0.222 0.778
##################################
# Identifying the most predictive variables
##################################
# Print the variable-importance scores stored on the CP = 0.020 pruned-tree
# summary object.
rpartFit.Apparent.CP.020.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 19.9861370
## Amount CheckingAccountStatus.0.to.200
## 12.7870484 6.7458251
## SavingsAccountBonds.lt.100 Purpose.NewCar
## 6.7311529 4.1140587
## SavingsAccountBonds.100.to.500 SavingsAccountBonds.Unknown
## 3.4191386 2.8566183
## CreditHistory.Critical SavingsAccountBonds.500.to.1000
## 2.0602865 1.9458261
## Age InstallmentRatePercentage
## 1.7459252 1.0546549
## EmploymentDuration.1.to.4 EmploymentDuration.4.to.7
## 0.7533249 0.7533249
## Property.RealEstate Job.UnskilledResident
## 0.4166667 0.1388889
##################################
# Plotting the RPART model structure
##################################
fancyRpartPlot(rpartFit.Apparent.CP.020.Pruned, caption = NULL)
##################################
# Evaluating the RPART model
# on the train set
##################################
# Predicted class label for every training row from the CP = 0.020 pruned tree.
MA_Train.Evaluated$PredClass.CP.020 <- predict(
  rpartFit.Apparent.CP.020.Pruned,
  newdata = MA_Train,
  type = "class"
)
# 1 when the predicted label equals the observed Class, 0 otherwise.
MA_Train.Evaluated$PredCorrect.CP.020 <- ifelse(
  MA_Train.Evaluated$Class == MA_Train.Evaluated$PredClass.CP.020, 1, 0
)

##################################
# Computing for the
# apparent model performance
##################################
# Mean of the 0/1 flags = training accuracy; the outer parentheses print the
# value in addition to storing it.
(Train.CP.020 <- mean(MA_Train.Evaluated$PredCorrect.CP.020))
## [1] 0.77
##################################
# Evaluating the RPART model
# on the test set
##################################
# Predicted class label for every test row from the CP = 0.020 pruned tree.
MA_Test.Evaluated$PredClass.CP.020 <- predict(
  rpartFit.Apparent.CP.020.Pruned,
  newdata = MA_Test,
  type = "class"
)
# 1 when the predicted label equals the observed Class, 0 otherwise.
MA_Test.Evaluated$PredCorrect.CP.020 <- ifelse(
  MA_Test.Evaluated$Class == MA_Test.Evaluated$PredClass.CP.020, 1, 0
)

##################################
# Computing for the
# external validation model performance
##################################
# Mean of the 0/1 flags = test-set accuracy; the outer parentheses print the
# value in addition to storing it.
(Test.CP.020 <- mean(MA_Test.Evaluated$PredCorrect.CP.020))
## [1] 0.725
################################################################################
# Plotting the apparent and external validation model performance
################################################################################
# Collect the per-CP accuracies: training-set scores in Apparent, test-set
# scores in Test, and the matching CP labels (kept as strings so they can be
# used as ordered factor levels on the plot axis).
Apparent <- c(Train.CP.001, Train.CP.005, Train.CP.010, Train.CP.015, Train.CP.020)
Test <- c(Test.CP.001, Test.CP.005, Test.CP.010, Test.CP.015, Test.CP.020)
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the apparent-performance table.
Performance <- Apparent
Validation <- rep("AV: Apparent", 5)
ApparentPerformance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance))
# cbind() on mixed character/numeric input yields a character matrix, so the
# accuracy column is converted back to numeric here.
ApparentPerformance_Data$Performance <- as.numeric(as.character(ApparentPerformance_Data$Performance))
ApparentPerformance_Data$Complexity_Parameter <- factor(
  ApparentPerformance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(ApparentPerformance_Data)
## 'data.frame': 5 obs. of 3 variables:
## $ Validation : chr "AV: Apparent" "AV: Apparent" "AV: Apparent" "AV: Apparent" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.84 0.834 0.802 0.775 0.77
# Print the assembled apparent-performance table.
ApparentPerformance_Data
## Validation Complexity_Parameter Performance
## 1 AV: Apparent 0.001 0.84000
## 2 AV: Apparent 0.005 0.83375
## 3 AV: Apparent 0.010 0.80250
## 4 AV: Apparent 0.015 0.77500
## 5 AV: Apparent 0.020 0.77000
# Plot apparent accuracy against the complexity parameter (blue points joined
# by a line). The outer parentheses print the lattice object as well as
# storing it.
(ApparentPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = ApparentPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "blue",
      col.line = "blue",
      cex = 2
    )
  }
))
# Assemble the split-sample (test-set) performance table from the Test
# accuracies and the Complexity_Parameter labels defined earlier.
Performance <- Test
Validation <- rep("EV_SSHV: External - Split-Sample", 5)
TestPerformance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance))
# cbind() on mixed character/numeric input yields a character matrix, so the
# accuracy column is converted back to numeric here.
TestPerformance_Data$Performance <- as.numeric(as.character(TestPerformance_Data$Performance))
TestPerformance_Data$Complexity_Parameter <- factor(
  TestPerformance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(TestPerformance_Data)
## 'data.frame': 5 obs. of 3 variables:
## $ Validation : chr "EV_SSHV: External - Split-Sample" "EV_SSHV: External - Split-Sample" "EV_SSHV: External - Split-Sample" "EV_SSHV: External - Split-Sample" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.73 0.745 0.74 0.73 0.725
# Print the assembled test-set performance table.
TestPerformance_Data
## Validation Complexity_Parameter Performance
## 1 EV_SSHV: External - Split-Sample 0.001 0.730
## 2 EV_SSHV: External - Split-Sample 0.005 0.745
## 3 EV_SSHV: External - Split-Sample 0.010 0.740
## 4 EV_SSHV: External - Split-Sample 0.015 0.730
## 5 EV_SSHV: External - Split-Sample 0.020 0.725
# Plot test-set accuracy against the complexity parameter (red points joined
# by a line). The outer parentheses print the lattice object as well as
# storing it.
(TestPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = TestPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "red",
      col.line = "red",
      cex = 2
    )
  }
))
################################################################################
# Applying k-fold cross validation
################################################################################
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))
# Tune rpart over the grid with 5-fold cross-validation.
rpartFit.CV.5 <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "cv",
    number = 5,
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.CV.5$results
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001 0.69375 0.2433436 0.02688227 0.05950434
## 2 0.005 0.69500 0.2418830 0.02911454 0.06405609
## 3 0.010 0.70500 0.2308807 0.03168448 0.06750406
## 4 0.015 0.70375 0.2135092 0.02404423 0.05470701
## 5 0.020 0.69250 0.1845879 0.02436699 0.05604208
################################################################################
# Plotting the internal model validation performance
# Applying k-fold cross validation
################################################################################
# Mean resampled accuracy per cp, with upper/lower limits at
# accuracy +/- 2 * AccuracySD.
CV.5 <- rpartFit.CV.5$results$Accuracy
CV.5.UCL <- rpartFit.CV.5$results$Accuracy + 2 * rpartFit.CV.5$results$AccuracySD
CV.5.LCL <- rpartFit.CV.5$results$Accuracy - 2 * rpartFit.CV.5$results$AccuracySD
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the 5-fold CV performance table.
Performance <- CV.5
Validation <- rep("IV_KFCV: Internal - K-Fold CV", 5)
CV.5Performance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance, CV.5.UCL, CV.5.LCL))
# cbind() on mixed character/numeric input yields a character matrix, so the
# numeric columns are converted back here.
CV.5Performance_Data$Performance <- as.numeric(as.character(CV.5Performance_Data$Performance))
CV.5Performance_Data$CV.5.UCL <- as.numeric(as.character(CV.5Performance_Data$CV.5.UCL))
CV.5Performance_Data$CV.5.LCL <- as.numeric(as.character(CV.5Performance_Data$CV.5.LCL))
CV.5Performance_Data$Complexity_Parameter <- factor(
  CV.5Performance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(CV.5Performance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_KFCV: Internal - K-Fold CV" "IV_KFCV: Internal - K-Fold CV" "IV_KFCV: Internal - K-Fold CV" "IV_KFCV: Internal - K-Fold CV" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.694 0.695 0.705 0.704 0.692
## $ CV.5.UCL : num 0.748 0.753 0.768 0.752 0.741
## $ CV.5.LCL : num 0.64 0.637 0.642 0.656 0.644
# Print the assembled 5-fold CV performance table.
CV.5Performance_Data
## Validation Complexity_Parameter Performance CV.5.UCL
## 1 IV_KFCV: Internal - K-Fold CV 0.001 0.69375 0.7475145
## 2 IV_KFCV: Internal - K-Fold CV 0.005 0.69500 0.7532291
## 3 IV_KFCV: Internal - K-Fold CV 0.010 0.70500 0.7683690
## 4 IV_KFCV: Internal - K-Fold CV 0.015 0.70375 0.7518385
## 5 IV_KFCV: Internal - K-Fold CV 0.020 0.69250 0.7412340
## CV.5.LCL
## 1 0.6399855
## 2 0.6367709
## 3 0.6416310
## 4 0.6556615
## 5 0.6437660
# Plot 5-fold CV accuracy against the complexity parameter, with vertical
# bars running from the LCL to the UCL column at each cp. The outer
# parentheses print the lattice object as well as storing it.
(CV.5Performance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = CV.5Performance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "black",
      col.line = "black",
      cex = 2
    )
    # code = 3 with angle = 90 draws a flat cap at both ends of each bar.
    panel.arrows(
      x0 = as.numeric(x),
      x1 = as.numeric(x),
      y0 = CV.5Performance_Data$CV.5.LCL,
      y1 = CV.5Performance_Data$CV.5.UCL,
      length = 0.10, angle = 90, code = 3, lend = 2
    )
  }
))
################################################################################
# Applying repeated k-fold cross validation
################################################################################
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Tune rpart over the grid with 5-fold cross-validation repeated 10 times.
rpartFit.RCV.10 <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "repeatedcv",
    repeats = 10,
    number = 5,
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.RCV.10$results
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001 0.706625 0.2753633 0.03288843 0.07591095
## 2 0.005 0.709375 0.2759455 0.03341362 0.07762500
## 3 0.010 0.720125 0.2801409 0.03106553 0.08147916
## 4 0.015 0.715375 0.2590690 0.03162303 0.08157790
## 5 0.020 0.712375 0.2380722 0.03267150 0.08628021
################################################################################
# Plotting the internal model validation performance
# Applying repeated k-fold cross validation
################################################################################
# Mean resampled accuracy per cp, with upper/lower limits at
# accuracy +/- 2 * AccuracySD.
RCV.10 <- rpartFit.RCV.10$results$Accuracy
RCV.10.UCL <- rpartFit.RCV.10$results$Accuracy + 2 * rpartFit.RCV.10$results$AccuracySD
RCV.10.LCL <- rpartFit.RCV.10$results$Accuracy - 2 * rpartFit.RCV.10$results$AccuracySD
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the repeated k-fold CV performance table.
Performance <- RCV.10
Validation <- rep("IV_RKFCV: Internal - Repeated K-Fold CV", 5)
RCV.10Performance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance, RCV.10.UCL, RCV.10.LCL))
# cbind() on mixed character/numeric input yields a character matrix, so the
# numeric columns are converted back here.
RCV.10Performance_Data$Performance <- as.numeric(as.character(RCV.10Performance_Data$Performance))
RCV.10Performance_Data$RCV.10.UCL <- as.numeric(as.character(RCV.10Performance_Data$RCV.10.UCL))
RCV.10Performance_Data$RCV.10.LCL <- as.numeric(as.character(RCV.10Performance_Data$RCV.10.LCL))
RCV.10Performance_Data$Complexity_Parameter <- factor(
  RCV.10Performance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(RCV.10Performance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_RKFCV: Internal - Repeated K-Fold CV" "IV_RKFCV: Internal - Repeated K-Fold CV" "IV_RKFCV: Internal - Repeated K-Fold CV" "IV_RKFCV: Internal - Repeated K-Fold CV" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.707 0.709 0.72 0.715 0.712
## $ RCV.10.UCL : num 0.772 0.776 0.782 0.779 0.778
## $ RCV.10.LCL : num 0.641 0.643 0.658 0.652 0.647
# Print the assembled repeated k-fold CV performance table.
RCV.10Performance_Data
## Validation Complexity_Parameter Performance
## 1 IV_RKFCV: Internal - Repeated K-Fold CV 0.001 0.706625
## 2 IV_RKFCV: Internal - Repeated K-Fold CV 0.005 0.709375
## 3 IV_RKFCV: Internal - Repeated K-Fold CV 0.010 0.720125
## 4 IV_RKFCV: Internal - Repeated K-Fold CV 0.015 0.715375
## 5 IV_RKFCV: Internal - Repeated K-Fold CV 0.020 0.712375
## RCV.10.UCL RCV.10.LCL
## 1 0.7724019 0.6408481
## 2 0.7762022 0.6425478
## 3 0.7822561 0.6579939
## 4 0.7786211 0.6521289
## 5 0.7777180 0.6470320
# Plot repeated k-fold CV accuracy against the complexity parameter, with
# vertical bars running from the LCL to the UCL column at each cp. The outer
# parentheses print the lattice object as well as storing it.
(RCV.10Performance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = RCV.10Performance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "black",
      col.line = "black",
      cex = 2
    )
    # code = 3 with angle = 90 draws a flat cap at both ends of each bar.
    panel.arrows(
      x0 = as.numeric(x),
      x1 = as.numeric(x),
      y0 = RCV.10Performance_Data$RCV.10.LCL,
      y1 = RCV.10Performance_Data$RCV.10.UCL,
      length = 0.10, angle = 90, code = 3, lend = 2
    )
  }
))
################################################################################
# Applying leave-one-out cross validation
################################################################################
# Fix the RNG seed for reproducibility.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Tune rpart over the grid with leave-one-out cross-validation.
rpartFit.LOOCV <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "LOOCV",
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.LOOCV$results
## cp Accuracy Kappa
## 1 0.001 0.65875 0.15217391
## 2 0.005 0.67375 0.16878981
## 3 0.010 0.66000 0.11458333
## 4 0.015 0.61875 -0.04595336
## 5 0.020 0.65750 0.02698864
################################################################################
# Plotting the internal model validation performance
# Applying leave-one-out cross validation
################################################################################
# Accuracy per cp from the LOOCV fit. No limits are built here: the results
# table printed above has no AccuracySD column.
LOOCV <- rpartFit.LOOCV$results$Accuracy
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the LOOCV performance table.
Performance <- LOOCV
Validation <- rep("IV_LOOCV: Internal - LOOCV", 5)
LOOCVPerformance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance))
# cbind() on mixed character/numeric input yields a character matrix, so the
# accuracy column is converted back to numeric here.
LOOCVPerformance_Data$Performance <- as.numeric(as.character(LOOCVPerformance_Data$Performance))
LOOCVPerformance_Data$Complexity_Parameter <- factor(
  LOOCVPerformance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(LOOCVPerformance_Data)
## 'data.frame': 5 obs. of 3 variables:
## $ Validation : chr "IV_LOOCV: Internal - LOOCV" "IV_LOOCV: Internal - LOOCV" "IV_LOOCV: Internal - LOOCV" "IV_LOOCV: Internal - LOOCV" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.659 0.674 0.66 0.619 0.657
# Print the assembled LOOCV performance table.
LOOCVPerformance_Data
## Validation Complexity_Parameter Performance
## 1 IV_LOOCV: Internal - LOOCV 0.001 0.65875
## 2 IV_LOOCV: Internal - LOOCV 0.005 0.67375
## 3 IV_LOOCV: Internal - LOOCV 0.010 0.66000
## 4 IV_LOOCV: Internal - LOOCV 0.015 0.61875
## 5 IV_LOOCV: Internal - LOOCV 0.020 0.65750
# Plot LOOCV accuracy against the complexity parameter (black points joined
# by a line). The outer parentheses print the lattice object as well as
# storing it.
(LOOCVPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = LOOCVPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "black",
      col.line = "black",
      cex = 2
    )
  }
))
################################################################################
# Applying leave-group-out cross validation
################################################################################
# Fix the RNG seed so the resampling partitions are reproducible.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Tune rpart over the grid with leave-group-out cross-validation:
# 500 resamples, each using p = 0.70 of the rows for training.
rpartFit.LGOCV <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "LGOCV",
    number = 500,
    p = 0.70,
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.LGOCV$results
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001 0.7017917 0.2620663 0.02806485 0.06606339
## 2 0.005 0.7060333 0.2648599 0.02773508 0.06752638
## 3 0.010 0.7121250 0.2632998 0.02485831 0.06411238
## 4 0.015 0.7122667 0.2506467 0.02333020 0.06424203
## 5 0.020 0.7110917 0.2344573 0.02336108 0.07288177
################################################################################
# Plotting the internal model validation performance
# Applying leave-group-out cross validation
################################################################################
# Mean resampled accuracy per cp, with upper/lower limits at
# accuracy +/- 2 * AccuracySD.
LGOCV <- rpartFit.LGOCV$results$Accuracy
LGOCV.UCL <- rpartFit.LGOCV$results$Accuracy + 2 * rpartFit.LGOCV$results$AccuracySD
LGOCV.LCL <- rpartFit.LGOCV$results$Accuracy - 2 * rpartFit.LGOCV$results$AccuracySD
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the LGOCV performance table.
Performance <- LGOCV
Validation <- rep("IV_LGOCV: Internal - LGOCV", 5)
LGOCVPerformance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance, LGOCV.UCL, LGOCV.LCL))
# cbind() on mixed character/numeric input yields a character matrix, so the
# numeric columns are converted back here.
LGOCVPerformance_Data$Performance <- as.numeric(as.character(LGOCVPerformance_Data$Performance))
LGOCVPerformance_Data$LGOCV.UCL <- as.numeric(as.character(LGOCVPerformance_Data$LGOCV.UCL))
LGOCVPerformance_Data$LGOCV.LCL <- as.numeric(as.character(LGOCVPerformance_Data$LGOCV.LCL))
LGOCVPerformance_Data$Complexity_Parameter <- factor(
  LGOCVPerformance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(LGOCVPerformance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_LGOCV: Internal - LGOCV" "IV_LGOCV: Internal - LGOCV" "IV_LGOCV: Internal - LGOCV" "IV_LGOCV: Internal - LGOCV" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.702 0.706 0.712 0.712 0.711
## $ LGOCV.UCL : num 0.758 0.762 0.762 0.759 0.758
## $ LGOCV.LCL : num 0.646 0.651 0.662 0.666 0.664
# Print the assembled LGOCV performance table.
LGOCVPerformance_Data
## Validation Complexity_Parameter Performance LGOCV.UCL
## 1 IV_LGOCV: Internal - LGOCV 0.001 0.7017917 0.7579214
## 2 IV_LGOCV: Internal - LGOCV 0.005 0.7060333 0.7615035
## 3 IV_LGOCV: Internal - LGOCV 0.010 0.7121250 0.7618416
## 4 IV_LGOCV: Internal - LGOCV 0.015 0.7122667 0.7589271
## 5 IV_LGOCV: Internal - LGOCV 0.020 0.7110917 0.7578138
## LGOCV.LCL
## 1 0.6456620
## 2 0.6505632
## 3 0.6624084
## 4 0.6656063
## 5 0.6643695
# Plot LGOCV accuracy against the complexity parameter, with vertical bars
# running from the LCL to the UCL column at each cp. The outer parentheses
# print the lattice object as well as storing it.
(LGOCVPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = LGOCVPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "black",
      col.line = "black",
      cex = 2
    )
    # code = 3 with angle = 90 draws a flat cap at both ends of each bar.
    panel.arrows(
      x0 = as.numeric(x),
      x1 = as.numeric(x),
      y0 = LGOCVPerformance_Data$LGOCV.LCL,
      y1 = LGOCVPerformance_Data$LGOCV.UCL,
      length = 0.10, angle = 90, code = 3, lend = 2
    )
  }
))
################################################################################
# Applying bootstrap validation
################################################################################
# Fix the RNG seed so the bootstrap resamples are reproducible.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Tune rpart over the grid with 500 bootstrap resamples.
rpartFit.Boot <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "boot",
    number = 500,
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.Boot$results
## cp Accuracy Kappa AccuracySD KappaSD
## 1 0.001 0.6909885 0.2503562 0.02809481 0.06377817
## 2 0.005 0.6974808 0.2571949 0.02781838 0.06370414
## 3 0.010 0.7053319 0.2617308 0.02628519 0.06188814
## 4 0.015 0.7111238 0.2615536 0.02548858 0.06498478
## 5 0.020 0.7123572 0.2546401 0.02533626 0.06716852
################################################################################
# Plotting the internal model validation performance
# Applying bootstrap validation
################################################################################
# Mean resampled accuracy per cp, with upper/lower limits at
# accuracy +/- 2 * AccuracySD.
Boot <- rpartFit.Boot$results$Accuracy
Boot.UCL <- rpartFit.Boot$results$Accuracy + 2 * rpartFit.Boot$results$AccuracySD
Boot.LCL <- rpartFit.Boot$results$Accuracy - 2 * rpartFit.Boot$results$AccuracySD
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")

# Assemble the bootstrap performance table.
Performance <- Boot
Validation <- rep("IV_BV: Internal - Bootstrap", 5)
BootPerformance_Data <- as.data.frame(cbind(Validation, Complexity_Parameter, Performance, Boot.UCL, Boot.LCL))
# cbind() on mixed character/numeric input yields a character matrix, so the
# numeric columns are converted back here.
BootPerformance_Data$Performance <- as.numeric(as.character(BootPerformance_Data$Performance))
BootPerformance_Data$Boot.UCL <- as.numeric(as.character(BootPerformance_Data$Boot.UCL))
BootPerformance_Data$Boot.LCL <- as.numeric(as.character(BootPerformance_Data$Boot.LCL))
BootPerformance_Data$Complexity_Parameter <- factor(
  BootPerformance_Data$Complexity_Parameter,
  levels = c("0.001", "0.005", "0.010", "0.015", "0.020")
)
str(BootPerformance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_BV: Internal - Bootstrap" "IV_BV: Internal - Bootstrap" "IV_BV: Internal - Bootstrap" "IV_BV: Internal - Bootstrap" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.691 0.697 0.705 0.711 0.712
## $ Boot.UCL : num 0.747 0.753 0.758 0.762 0.763
## $ Boot.LCL : num 0.635 0.642 0.653 0.66 0.662
# Print the assembled bootstrap performance table.
BootPerformance_Data
## Validation Complexity_Parameter Performance Boot.UCL
## 1 IV_BV: Internal - Bootstrap 0.001 0.6909885 0.7471781
## 2 IV_BV: Internal - Bootstrap 0.005 0.6974808 0.7531176
## 3 IV_BV: Internal - Bootstrap 0.010 0.7053319 0.7579023
## 4 IV_BV: Internal - Bootstrap 0.015 0.7111238 0.7621010
## 5 IV_BV: Internal - Bootstrap 0.020 0.7123572 0.7630297
## Boot.LCL
## 1 0.6347989
## 2 0.6418440
## 3 0.6527615
## 4 0.6601466
## 5 0.6616847
# Plot bootstrap accuracy against the complexity parameter, with vertical
# bars running from the LCL to the UCL column at each cp. The outer
# parentheses print the lattice object as well as storing it.
(BootPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = BootPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    panel.xyplot(
      x, y,
      type = c("p", "l"),
      pch = 16,
      col.symbol = "black",
      col.line = "black",
      cex = 2
    )
    # code = 3 with angle = 90 draws a flat cap at both ends of each bar.
    panel.arrows(
      x0 = as.numeric(x),
      x1 = as.numeric(x),
      y0 = BootPerformance_Data$Boot.LCL,
      y1 = BootPerformance_Data$Boot.UCL,
      length = 0.10, angle = 90, code = 3, lend = 2
    )
  }
))
################################################################################
# Applying bootstrap 0.632 validation
################################################################################
# Fix the RNG seed so the bootstrap resamples are reproducible.
set.seed(12345678)
# Candidate complexity-parameter values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Tune rpart over the grid with the "boot632" resampling method,
# 500 resamples.
rpartFit.Boot632 <- train(
  Class ~ .,
  data = MA_Train,
  method = "rpart",
  tuneGrid = grid,
  trControl = trainControl(
    method = "boot632",
    number = 500,
    classProbs = TRUE
  )
)
# Resampled performance for each cp value.
rpartFit.Boot632$results
## cp Accuracy Kappa AccuracySD KappaSD AccuracyApparent
## 1 0.001 0.7458068 0.3800784 0.02809481 0.06377817 0.84000
## 2 0.005 0.7476114 0.3730317 0.02781838 0.06370414 0.83375
## 3 0.010 0.7410780 0.3446075 0.02628519 0.06188814 0.80250
## 4 0.015 0.7346225 0.2925940 0.02548858 0.06498478 0.77500
## 5 0.020 0.7335628 0.2857039 0.02533626 0.06716852 0.77000
## KappaApparent
## 1 0.6029777
## 2 0.5720721
## 3 0.4870130
## 4 0.3459302
## 5 0.3390805
################################################################################
# Plotting the internal model validation performance
# Applying bootstrap 0.632 validation
################################################################################
# Point estimate and an approximate +/- 2 SD band for the bootstrap 0.632
# accuracy, one value per row of rpartFit.Boot632$results.
Boot632 <- rpartFit.Boot632$results$Accuracy
Boot632.UCL <- Boot632 + 2 * rpartFit.Boot632$results$AccuracySD
Boot632.LCL <- Boot632 - 2 * rpartFit.Boot632$results$AccuracySD

# Display labels for the cp values. These are hard-coded, so they must stay
# in the same order as the rows of rpartFit.Boot632$results.
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")
Performance <- Boot632
Validation <- rep("IV_B632V: Internal - Bootstrap 0.632", 5)

# Build the plotting table directly as a data frame so the numeric columns
# keep their type (cbind() on mixed inputs would coerce everything to
# character first). Complexity_Parameter becomes a factor whose level order
# follows the labels above.
Boot632Performance_Data <- data.frame(
  Validation = Validation,
  Complexity_Parameter = factor(Complexity_Parameter,
                                levels = Complexity_Parameter),
  Performance = Performance,
  Boot632.UCL = Boot632.UCL,
  Boot632.LCL = Boot632.LCL,
  stringsAsFactors = FALSE
)
str(Boot632Performance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_B632V: Internal - Bootstrap 0.632" "IV_B632V: Internal - Bootstrap 0.632" "IV_B632V: Internal - Bootstrap 0.632" "IV_B632V: Internal - Bootstrap 0.632" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.746 0.748 0.741 0.735 0.734
## $ Boot632.UCL : num 0.802 0.803 0.794 0.786 0.784
## $ Boot632.LCL : num 0.69 0.692 0.689 0.684 0.683
Boot632Performance_Data
## Validation Complexity_Parameter Performance
## 1 IV_B632V: Internal - Bootstrap 0.632 0.001 0.7458068
## 2 IV_B632V: Internal - Bootstrap 0.632 0.005 0.7476114
## 3 IV_B632V: Internal - Bootstrap 0.632 0.010 0.7410780
## 4 IV_B632V: Internal - Bootstrap 0.632 0.015 0.7346225
## 5 IV_B632V: Internal - Bootstrap 0.632 0.020 0.7335628
## Boot632.UCL Boot632.LCL
## 1 0.8019964 0.6896172
## 2 0.8032482 0.6919747
## 3 0.7936484 0.6885077
## 4 0.7855997 0.6836454
## 5 0.7842353 0.6828903
# Plot the bootstrap 0.632 accuracy estimate against the complexity
# parameter, one panel per Validation label, with vertical error bars
# running from Boot632.LCL to Boot632.UCL. The outer parentheses make R
# print the plot in addition to storing it in Boot632Performance_Plot.
(Boot632Performance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = Boot632Performance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    # Points joined by a line, both drawn in black.
    panel.xyplot(x,
                 y,
                 type = c("p", "l"),
                 pch = 16,
                 col.symbol = "black",
                 col.line = "black",
                 cex = 2)
    # Error bars: the limits are read from the full data frame rather than
    # from the panel's own subset, so this relies on the data having a
    # single Validation level (one panel).
    # angle = 90 with code = 3 draws a flat cap at both ends of each bar.
    panel.arrows(x0 = as.numeric(x),
                 x1 = as.numeric(x),
                 y0 = Boot632Performance_Data$Boot632.LCL,
                 y1 = Boot632Performance_Data$Boot632.UCL,
                 length = 0.10, angle = 90, code = 3, lend = 2)
  }
))
################################################################################
# Applying bootstrap with optimism estimation validation
################################################################################
# Fix the RNG seed so the bootstrap resamples are reproducible.
set.seed(12345678)

# Candidate complexity parameter (cp) values to evaluate.
grid <- expand.grid(cp = c(0.001, 0.005, 0.010, 0.015, 0.020))

# Fit an rpart tree for every cp in the grid, estimating performance with
# caret's "optimism_boot" resampling method over 500 resamples.
rpartFit.BootOptimism <- train(Class ~ .,
                               data = MA_Train,
                               method = "rpart",
                               tuneGrid = grid,
                               trControl = trainControl(method = "optimism_boot",
                                                        number = 500,
                                                        classProbs = TRUE))

# Print the per-cp resampling summary.
rpartFit.BootOptimism$results
## cp Accuracy Kappa AccuracySD KappaSD AccuracyApparent
## 1 0.001 0.7547650 0.3962111 0.02809481 0.06377817 0.84000
## 2 0.005 0.7528225 0.3732562 0.02781838 0.06370414 0.83375
## 3 0.010 0.7306500 0.3068618 0.02628519 0.06188814 0.80250
## 4 0.015 0.7164050 0.1964290 0.02548858 0.06498478 0.77500
## 5 0.020 0.7214225 0.2141386 0.02533626 0.06716852 0.77000
## KappaApparent AccuracyOptimism KappaOptimism
## 1 0.6029777 -0.0852350 -0.2067665
## 2 0.5720721 -0.0809275 -0.1988158
## 3 0.4870130 -0.0718500 -0.1801512
## 4 0.3459302 -0.0585950 -0.1495012
## 5 0.3390805 -0.0485775 -0.1249418
################################################################################
# Plotting the internal model validation performance
# Applying bootstrap with optimism estimation validation
################################################################################
# Point estimate and an approximate +/- 2 SD band for the optimism-corrected
# bootstrap accuracy, one value per row of rpartFit.BootOptimism$results.
BootOptimism <- rpartFit.BootOptimism$results$Accuracy
BootOptimism.UCL <- BootOptimism + 2 * rpartFit.BootOptimism$results$AccuracySD
BootOptimism.LCL <- BootOptimism - 2 * rpartFit.BootOptimism$results$AccuracySD

# Display labels for the cp values. These are hard-coded, so they must stay
# in the same order as the rows of rpartFit.BootOptimism$results.
Complexity_Parameter <- c("0.001", "0.005", "0.010", "0.015", "0.020")
Performance <- BootOptimism
Validation <- rep("IV_BVOE: Internal - Bootstrap Optimism", 5)

# Build the plotting table directly as a data frame so the numeric columns
# keep their type (cbind() on mixed inputs would coerce everything to
# character first). Complexity_Parameter becomes a factor whose level order
# follows the labels above.
BootOptimismPerformance_Data <- data.frame(
  Validation = Validation,
  Complexity_Parameter = factor(Complexity_Parameter,
                                levels = Complexity_Parameter),
  Performance = Performance,
  BootOptimism.UCL = BootOptimism.UCL,
  BootOptimism.LCL = BootOptimism.LCL,
  stringsAsFactors = FALSE
)
str(BootOptimismPerformance_Data)
## 'data.frame': 5 obs. of 5 variables:
## $ Validation : chr "IV_BVOE: Internal - Bootstrap Optimism" "IV_BVOE: Internal - Bootstrap Optimism" "IV_BVOE: Internal - Bootstrap Optimism" "IV_BVOE: Internal - Bootstrap Optimism" ...
## $ Complexity_Parameter: Factor w/ 5 levels "0.001","0.005",..: 1 2 3 4 5
## $ Performance : num 0.755 0.753 0.731 0.716 0.721
## $ BootOptimism.UCL : num 0.811 0.808 0.783 0.767 0.772
## $ BootOptimism.LCL : num 0.699 0.697 0.678 0.665 0.671
BootOptimismPerformance_Data
## Validation Complexity_Parameter Performance
## 1 IV_BVOE: Internal - Bootstrap Optimism 0.001 0.7547650
## 2 IV_BVOE: Internal - Bootstrap Optimism 0.005 0.7528225
## 3 IV_BVOE: Internal - Bootstrap Optimism 0.010 0.7306500
## 4 IV_BVOE: Internal - Bootstrap Optimism 0.015 0.7164050
## 5 IV_BVOE: Internal - Bootstrap Optimism 0.020 0.7214225
## BootOptimism.UCL BootOptimism.LCL
## 1 0.8109546 0.6985754
## 2 0.8084593 0.6971857
## 3 0.7832204 0.6780796
## 4 0.7673822 0.6654278
## 5 0.7720950 0.6707500
# Plot the optimism-corrected bootstrap accuracy estimate against the
# complexity parameter, one panel per Validation label, with vertical error
# bars running from BootOptimism.LCL to BootOptimism.UCL. The outer
# parentheses make R print the plot in addition to storing it in
# BootOptimismPerformance_Plot.
(BootOptimismPerformance_Plot <- xyplot(
  Performance ~ Complexity_Parameter | Validation,
  data = BootOptimismPerformance_Data,
  ylab = "Estimated Accuracy",
  xlab = "Complexity Parameter",
  auto.key = list(adj = 1),
  ylim = seq(0.60, 0.85, 0.05),
  panel = function(x, y) {
    panel.grid(h = 4, v = 4)
    # Points joined by a line, both drawn in black.
    panel.xyplot(x,
                 y,
                 type = c("p", "l"),
                 pch = 16,
                 col.symbol = "black",
                 col.line = "black",
                 cex = 2)
    # Error bars: the limits are read from the full data frame rather than
    # from the panel's own subset, so this relies on the data having a
    # single Validation level (one panel).
    # angle = 90 with code = 3 draws a flat cap at both ends of each bar.
    panel.arrows(x0 = as.numeric(x),
                 x1 = as.numeric(x),
                 y0 = BootOptimismPerformance_Data$BootOptimism.LCL,
                 y1 = BootOptimismPerformance_Data$BootOptimism.UCL,
                 length = 0.10, angle = 90, code = 3, lend = 2)
  }
))
################################################################################
# Consolidating all model performance results
################################################################################
# Lay out every validation-performance plot on a single page, three per row.
# NOTE(review): confirm that RCV.5Performance_Plot matches the name of the
# repeated cross-validation plot object created earlier in this script.
grid.arrange(ApparentPerformance_Plot,
             TestPerformance_Plot,
             CV.10Performance_Plot,
             RCV.5Performance_Plot,
             LOOCVPerformance_Plot,
             LGOCVPerformance_Plot,
             BootPerformance_Plot,
             Boot632Performance_Plot,
             BootOptimismPerformance_Plot,
             ncol = 3)
##################################
# Plotting the final RPART model
##################################
# Draw the tree stored in rpartFit.Apparent.CP.010.Pruned.
# NOTE(review): caption = NULL presumably suppresses the default caption
# text under the plot — confirm against the rattle documentation.
fancyRpartPlot(rpartFit.Apparent.CP.010.Pruned, caption = NULL)
##################################
# Identifying the most predictive variables
# for the final RPART model
##################################
# Print the variable-importance scores held in the model summary object.
# NOTE(review): confirm rpartFit.Apparent.CP.010.Pruned.Summary is the name
# of the summary object created earlier in this script.
rpartFit.Apparent.CP.010.Pruned.Summary$variable.importance
## CheckingAccountStatus.none Duration
## 36.1694738 22.9643483
## Amount SavingsAccountBonds.lt.100
## 14.7173141 11.8314467
## CheckingAccountStatus.0.to.200 SavingsAccountBonds.Unknown
## 10.9886136 5.6824730
## SavingsAccountBonds.100.to.500 Age
## 5.4620072 5.4618791
## OtherDebtorsGuarantors.Guarantor Purpose.NewCar
## 4.9643144 4.1140587
## Purpose.UsedCar OtherDebtorsGuarantors.None
## 3.2526862 2.6003552
## CheckingAccountStatus.gt.200 CreditHistory.Critical
## 2.4870121 2.0602865
## SavingsAccountBonds.500.to.1000 Job.SkilledEmployee
## 1.9458261 1.1303419
## InstallmentRatePercentage EmploymentDuration.1.to.4
## 1.0546549 0.7533249
## EmploymentDuration.4.to.7 Property.RealEstate
## 0.7533249 0.4166667
## Purpose.Business Purpose.Furniture.Equipment
## 0.3763021 0.2605168
## Job.UnskilledResident
## 0.1388889