##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(mlbench)
library(pls)
library(corrplot)
library(lares)
library(DMwR)
library(DMwR2)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
library(nnet)
library(elasticnet)
library(earth)
library(party)
library(kernlab)
library(randomForest)
library(Cubist)
library(pROC)
library(mda)
library(klaR)
library(pamr)
library(ROSE)
library(themis)
##################################
# Loading source and
# formulating the train set
##################################
data(Sonar)
Sonar.Original <- Sonar

Sonar.M <- Sonar[Sonar$Class=="M",]
Sonar.R <- Sonar[Sonar$Class=="R",]

set.seed(12345678)
Sonar.R.Reduced <- Sonar.R[sample(1:nrow(Sonar.R),25),]

Sonar <- as.data.frame(rbind(Sonar.M,Sonar.R.Reduced))

set.seed(12345678)
Sonar_Partition <- createDataPartition(Sonar$Class, p = .70, list = FALSE)
Sonar_Train <- Sonar[Sonar_Partition,]
Sonar_Test  <- Sonar[-Sonar_Partition,]
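##################################
# Verifying the class distribution
# after the induced imbalance and split
##################################
# Illustrative check, not part of the original output: since only 25 of the
# original R (rock) rows were retained before splitting, both partitions
# should be dominated by the M (mine) class.
table(Sonar_Train$Class)
prop.table(table(Sonar_Train$Class))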
##################################
# Performing a general exploration of the train set
##################################
dim(Sonar_Train)
## [1] 96 61
str(Sonar_Train)
## 'data.frame': 96 obs. of 61 variables:
## $ V1 : num 0.0629 0.0587 0.0428 0.0599 0.0264 0.0454 0.0283 0.0114 0.0414 0.0228 ...
## $ V2 : num 0.1065 0.121 0.0555 0.0474 0.0071 ...
## $ V3 : num 0.1526 0.1268 0.0708 0.0498 0.0342 ...
## $ V4 : num 0.1229 0.1498 0.0618 0.0387 0.0793 ...
## $ V5 : num 0.144 0.144 0.121 0.103 0.104 ...
## $ V6 : num 0.119 0.0561 0.1524 0.0773 0.0783 ...
## $ V7 : num 0.0884 0.0832 0.1543 0.0853 0.1417 ...
## $ V8 : num 0.0907 0.0672 0.0391 0.0447 0.1176 ...
## $ V9 : num 0.2107 0.1372 0.061 0.1094 0.0453 ...
## $ V10 : num 0.3597 0.2352 0.0113 0.0351 0.0945 ...
## $ V11 : num 0.547 0.321 0.126 0.158 0.113 ...
## $ V12 : num 0.52 0.426 0.247 0.202 0.084 ...
## $ V13 : num 0.5127 0.5201 0.3011 0.2268 0.0717 ...
## $ V14 : num 0.539 0.491 0.375 0.283 0.197 ...
## $ V15 : num 0.656 0.595 0.452 0.382 0.263 ...
## $ V16 : num 0.871 0.722 0.539 0.467 0.419 ...
## $ V17 : num 0.979 0.904 0.659 0.669 0.505 ...
## $ V18 : num 0.933 0.911 0.711 0.865 0.671 ...
## $ V19 : num 0.792 0.872 0.76 0.936 0.792 ...
## $ V20 : num 0.738 0.769 0.867 0.937 0.838 ...
## $ V21 : num 0.691 0.733 0.842 0.914 0.876 ...
## $ V22 : num 0.385 0.522 0.797 0.916 0.942 ...
## $ V23 : num 0.0671 0.3097 0.8385 0.9311 1 ...
## $ V24 : num 0.0502 0.3172 0.9317 0.8604 0.9931 ...
## $ V25 : num 0.272 0.227 0.856 0.733 0.958 ...
## $ V26 : num 0.284 0.164 0.616 0.576 0.865 ...
## $ V27 : num 0.223 0.175 0.414 0.416 0.722 ...
## $ V28 : num 0.191 0.183 0.327 0.411 0.58 ...
## $ V29 : num 0.0408 0.2048 0.3108 0.4146 0.4964 ...
## $ V30 : num 0.253 0.167 0.255 0.315 0.489 ...
## $ V31 : num 0.198 0.277 0.337 0.294 0.408 ...
## $ V32 : num 0.189 0.31 0.447 0.317 0.244 ...
## $ V33 : num 0.243 0.34 0.5 0.315 0.177 ...
## $ V34 : num 0.196 0.444 0.511 0.413 0.247 ...
## $ V35 : num 0.267 0.505 0.519 0.399 0.352 ...
## $ V36 : num 0.134 0.281 0.462 0.419 0.376 ...
## $ V37 : num 0.107 0.168 0.423 0.453 0.291 ...
## $ V38 : num 0.202 0.263 0.437 0.442 0.231 ...
## $ V39 : num 0.179 0.32 0.428 0.474 0.317 ...
## $ V40 : num 0.0227 0.1933 0.4433 0.3431 0.3554 ...
## $ V41 : num 0.1313 0.0934 0.37 0.3194 0.3741 ...
## $ V42 : num 0.1775 0.0443 0.3324 0.337 0.4443 ...
## $ V43 : num 0.155 0.078 0.256 0.249 0.326 ...
## $ V44 : num 0.1626 0.0722 0.2527 0.265 0.1963 ...
## $ V45 : num 0.0708 0.0405 0.2137 0.1748 0.0864 ...
## $ V46 : num 0.0129 0.0553 0.1789 0.0932 0.1688 ...
## $ V47 : num 0.0795 0.1081 0.101 0.053 0.1991 ...
## $ V48 : num 0.0762 0.1139 0.0528 0.0081 0.1217 ...
## $ V49 : num 0.0117 0.0767 0.0453 0.0342 0.0628 0.038 0.0244 0.0728 0.0177 0.0649 ...
## $ V50 : num 0.0061 0.0265 0.0118 0.0137 0.0323 0.0142 0.0179 0.0174 0.0065 0.0313 ...
## $ V51 : num 0.0257 0.0215 0.0009 0.0028 0.0253 0.0137 0.0109 0.0213 0.0222 0.0185 ...
## $ V52 : num 0.0089 0.0331 0.0142 0.0013 0.0214 0.012 0.0147 0.0269 0.0045 0.0098 ...
## $ V53 : num 0.0262 0.0111 0.0179 0.0005 0.0262 0.0042 0.017 0.0152 0.0136 0.0178 ...
## $ V54 : num 0.0108 0.0088 0.0079 0.0227 0.0177 0.0238 0.0158 0.0257 0.0113 0.0077 ...
## $ V55 : num 0.0138 0.0158 0.006 0.0209 0.0037 0.0129 0.0046 0.0097 0.0053 0.0074 ...
## $ V56 : num 0.0187 0.0122 0.0131 0.0081 0.0068 0.0084 0.0073 0.0041 0.0165 0.0095 ...
## $ V57 : num 0.023 0.0038 0.0089 0.0117 0.0121 0.0218 0.0054 0.005 0.0141 0.0055 ...
## $ V58 : num 0.0057 0.0101 0.0084 0.0114 0.0077 0.0321 0.0033 0.0145 0.0077 0.0045 ...
## $ V59 : num 0.0113 0.0228 0.0113 0.0112 0.0078 0.0154 0.0045 0.0103 0.0246 0.0063 ...
## $ V60 : num 0.0131 0.0124 0.0049 0.01 0.0066 0.0053 0.0079 0.0025 0.0198 0.0039 ...
## $ Class: Factor w/ 2 levels "M","R": 1 1 1 1 1 1 1 1 1 1 ...
summary(Sonar_Train)
## V1 V2 V3 V4
## Min. :0.00150 Min. :0.00060 Min. :0.00150 Min. :0.00580
## 1st Qu.:0.01362 1st Qu.:0.01897 1st Qu.:0.02448 1st Qu.:0.02960
## Median :0.02320 Median :0.03385 Median :0.03880 Median :0.04905
## Mean :0.03171 Mean :0.04387 Mean :0.04718 Mean :0.05631
## 3rd Qu.:0.03982 3rd Qu.:0.05892 3rd Qu.:0.06212 3rd Qu.:0.07220
## Max. :0.13710 Max. :0.15740 Max. :0.16650 Max. :0.16440
## V5 V6 V7 V8
## Min. :0.00670 Min. :0.0102 Min. :0.0182 Min. :0.0124
## 1st Qu.:0.04530 1st Qu.:0.0782 1st Qu.:0.0937 1st Qu.:0.0950
## Median :0.07430 Median :0.1135 Median :0.1298 Median :0.1356
## Mean :0.08196 Mean :0.1178 Mean :0.1332 Mean :0.1511
## 3rd Qu.:0.10855 3rd Qu.:0.1496 3rd Qu.:0.1683 3rd Qu.:0.1906
## Max. :0.24820 Max. :0.3823 Max. :0.3729 Max. :0.4566
## V9 V10 V11 V12
## Min. :0.0075 Min. :0.0113 Min. :0.0526 Min. :0.0236
## 1st Qu.:0.1299 1st Qu.:0.1424 1st Qu.:0.1926 1st Qu.:0.1837
## Median :0.1815 Median :0.2124 Median :0.2515 Median :0.2781
## Mean :0.2039 Mean :0.2334 Mean :0.2662 Mean :0.2796
## 3rd Qu.:0.2596 3rd Qu.:0.2940 3rd Qu.:0.3335 3rd Qu.:0.3501
## Max. :0.6828 Max. :0.5965 Max. :0.6675 Max. :0.5679
## V13 V14 V15 V16
## Min. :0.0616 Min. :0.0273 Min. :0.0092 Min. :0.0422
## 1st Qu.:0.2122 1st Qu.:0.1855 1st Qu.:0.1673 1st Qu.:0.1911
## Median :0.2930 Median :0.2904 Median :0.2751 Median :0.3203
## Mean :0.3021 Mean :0.3139 Mean :0.3194 Mean :0.3753
## 3rd Qu.:0.3730 3rd Qu.:0.4051 3rd Qu.:0.4403 3rd Qu.:0.5332
## Max. :0.7131 Max. :0.9970 Max. :0.9137 Max. :0.9751
## V17 V18 V19 V20
## Min. :0.0367 Min. :0.0375 Min. :0.1316 Min. :0.0656
## 1st Qu.:0.2087 1st Qu.:0.2427 1st Qu.:0.2964 1st Qu.:0.3972
## Median :0.3160 Median :0.3730 Median :0.4462 Median :0.6223
## Mean :0.4137 Mean :0.4475 Mean :0.5134 Mean :0.5861
## 3rd Qu.:0.6466 3rd Qu.:0.6731 3rd Qu.:0.7310 3rd Qu.:0.7978
## Max. :1.0000 Max. :0.9335 Max. :0.9828 Max. :1.0000
## V21 V22 V23 V24
## Min. :0.0512 Min. :0.0219 Min. :0.0610 Min. :0.0502
## 1st Qu.:0.4412 1st Qu.:0.3991 1st Qu.:0.4533 1st Qu.:0.5795
## Median :0.6939 Median :0.7021 Median :0.7139 Median :0.6985
## Mean :0.6393 Mean :0.6364 Mean :0.6500 Mean :0.6795
## 3rd Qu.:0.8449 3rd Qu.:0.8498 3rd Qu.:0.8690 3rd Qu.:0.8968
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.0240 Min. :0.1640 Min. :0.1036 Min. :0.0598
## 1st Qu.:0.5690 1st Qu.:0.5637 1st Qu.:0.4955 1st Qu.:0.5582
## Median :0.7211 Median :0.7560 Median :0.7930 Median :0.7762
## Mean :0.6807 Mean :0.7079 Mean :0.7074 Mean :0.7076
## 3rd Qu.:0.8749 3rd Qu.:0.8766 3rd Qu.:0.9109 3rd Qu.:0.9116
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.0144 Min. :0.0613 Min. :0.1000 Min. :0.0877
## 1st Qu.:0.4667 1st Qu.:0.4140 1st Qu.:0.3268 1st Qu.:0.2771
## Median :0.7096 Median :0.6028 Median :0.4416 Median :0.4078
## Mean :0.6518 Mean :0.5869 Mean :0.4970 Mean :0.4364
## 3rd Qu.:0.8672 3rd Qu.:0.7189 3rd Qu.:0.6461 3rd Qu.:0.5816
## Max. :1.0000 Max. :1.0000 Max. :0.9657 Max. :0.9306
## V33 V34 V35 V36
## Min. :0.0477 Min. :0.0588 Min. :0.0223 Min. :0.0080
## 1st Qu.:0.2364 1st Qu.:0.2164 1st Qu.:0.1746 1st Qu.:0.1381
## Median :0.3875 Median :0.3644 Median :0.2930 Median :0.2808
## Mean :0.4079 Mean :0.3940 Mean :0.3772 Mean :0.3649
## 3rd Qu.:0.5409 3rd Qu.:0.5421 3rd Qu.:0.5775 3rd Qu.:0.5348
## Max. :1.0000 Max. :0.9536 Max. :0.9518 Max. :1.0000
## V37 V38 V39 V40
## Min. :0.0351 Min. :0.0618 Min. :0.0436 Min. :0.0227
## 1st Qu.:0.1447 1st Qu.:0.1747 1st Qu.:0.1827 1st Qu.:0.1962
## Median :0.2594 Median :0.3245 Median :0.3058 Median :0.2812
## Mean :0.3525 Mean :0.3458 Mean :0.3464 Mean :0.3173
## 3rd Qu.:0.4884 3rd Qu.:0.4405 3rd Qu.:0.4801 3rd Qu.:0.4269
## Max. :0.9123 Max. :0.9480 Max. :0.9709 Max. :0.9297
## V41 V42 V43 V44
## Min. :0.0438 Min. :0.0443 Min. :0.0308 Min. :0.0255
## 1st Qu.:0.1696 1st Qu.:0.1688 1st Qu.:0.1611 1st Qu.:0.1386
## Median :0.2658 Median :0.2808 Median :0.2580 Median :0.1916
## Mean :0.3043 Mean :0.3053 Mean :0.2722 Mean :0.2370
## 3rd Qu.:0.4094 3rd Qu.:0.3973 3rd Qu.:0.3471 3rd Qu.:0.3081
## Max. :0.8995 Max. :0.8246 Max. :0.7517 Max. :0.5772
## V45 V46 V47 V48
## Min. :0.0352 Min. :0.0080 Min. :0.01790 Min. :0.0081
## 1st Qu.:0.1105 1st Qu.:0.0846 1st Qu.:0.07727 1st Qu.:0.0531
## Median :0.1736 Median :0.1445 Median :0.10900 Median :0.0935
## Mean :0.2362 Mean :0.1930 Mean :0.14301 Mean :0.1088
## 3rd Qu.:0.3626 3rd Qu.:0.2283 3rd Qu.:0.18247 3rd Qu.:0.1351
## Max. :0.7034 Max. :0.7292 Max. :0.55220 Max. :0.3339
## V49 V50 V51 V52
## Min. :0.00730 Min. :0.00440 Min. :0.00090 Min. :0.00130
## 1st Qu.:0.03322 1st Qu.:0.01310 1st Qu.:0.01040 1st Qu.:0.00875
## Median :0.05445 Median :0.01920 Median :0.01565 Median :0.01215
## Mean :0.06444 Mean :0.02375 Mean :0.01878 Mean :0.01516
## 3rd Qu.:0.09137 3rd Qu.:0.02902 3rd Qu.:0.02363 3rd Qu.:0.01830
## Max. :0.19810 Max. :0.08250 Max. :0.10040 Max. :0.07090
## V53 V54 V55 V56
## Min. :0.000500 Min. :0.001000 Min. :0.001100 Min. :0.000400
## 1st Qu.:0.004975 1st Qu.:0.005375 1st Qu.:0.003700 1st Qu.:0.004350
## Median :0.007900 Median :0.009700 Median :0.007700 Median :0.007050
## Mean :0.010800 Mean :0.011600 Mean :0.009373 Mean :0.008372
## 3rd Qu.:0.015375 3rd Qu.:0.015050 3rd Qu.:0.012625 3rd Qu.:0.011625
## Max. :0.036100 Max. :0.035200 Max. :0.044700 Max. :0.039400
## V57 V58 V59 V60
## Min. :0.001100 Min. :0.000900 Min. :0.000100 Min. :0.000600
## 1st Qu.:0.003700 1st Qu.:0.003600 1st Qu.:0.003550 1st Qu.:0.003100
## Median :0.005750 Median :0.006300 Median :0.007000 Median :0.005100
## Mean :0.007678 Mean :0.008472 Mean :0.008259 Mean :0.006066
## 3rd Qu.:0.010725 3rd Qu.:0.010275 3rd Qu.:0.010750 3rd Qu.:0.008125
## Max. :0.035500 Max. :0.044000 Max. :0.029400 Max. :0.021800
## Class
## M:78
## R:18
##
##
##
##
##################################
# Performing a general exploration of the test set
##################################
dim(Sonar_Test)
## [1] 40 61
str(Sonar_Test)
## 'data.frame': 40 obs. of 61 variables:
## $ V1 : num 0.0491 0.1313 0.0201 0.0335 0.0162 ...
## $ V2 : num 0.0279 0.2339 0.0423 0.0134 0.0253 ...
## $ V3 : num 0.0592 0.3059 0.0554 0.0696 0.0262 ...
## $ V4 : num 0.127 0.4264 0.0783 0.118 0.0386 ...
## $ V5 : num 0.1772 0.401 0.062 0.0348 0.0645 ...
## $ V6 : num 0.1908 0.1791 0.0871 0.118 0.0472 ...
## $ V7 : num 0.222 0.185 0.12 0.195 0.106 ...
## $ V8 : num 0.0768 0.0055 0.2707 0.1607 0.1388 ...
## $ V9 : num 0.1246 0.1929 0.1206 0.3036 0.0598 ...
## $ V10 : num 0.2028 0.2231 0.0279 0.4372 0.1334 ...
## $ V11 : num 0.0947 0.2907 0.2251 0.5533 0.2969 ...
## $ V12 : num 0.25 0.226 0.262 0.577 0.475 ...
## $ V13 : num 0.221 0.314 0.177 0.702 0.568 ...
## $ V14 : num 0.32 0.33 0.371 0.707 0.569 ...
## $ V15 : num 0.334 0.366 0.453 0.737 0.642 ...
## $ V16 : num 0.332 0.396 0.555 0.739 0.749 ...
## $ V17 : num 0.278 0.439 0.462 0.862 0.9 ...
## $ V18 : num 0.297 0.467 0.38 0.946 1 ...
## $ V19 : num 0.295 0.525 0.345 0.878 0.969 ...
## $ V20 : num 0.173 0.373 0.267 0.791 0.903 ...
## $ V21 : num 0.326 0.224 0.239 0.576 0.768 ...
## $ V22 : num 0.383 0.197 0.113 0.306 0.7 ...
## $ V23 : num 0.3523 0.4337 0.2556 0.0563 0.6644 ...
## $ V24 : num 0.541 0.6532 0.5169 0.0239 0.5964 ...
## $ V25 : num 0.523 0.507 0.378 0.255 0.371 ...
## $ V26 : num 0.4475 0.2796 0.4082 0.4862 0.0921 ...
## $ V27 : num 0.534 0.4163 0.5353 0.5027 0.0481 ...
## $ V28 : num 0.5323 0.595 0.5116 0.4402 0.0876 ...
## $ V29 : num 0.391 0.524 0.454 0.285 0.104 ...
## $ V30 : num 0.346 0.418 0.426 0.18 0.171 ...
## $ V31 : num 0.409 0.371 0.387 0.356 0.326 ...
## $ V32 : num 0.464 0.237 0.394 0.352 0.461 ...
## $ V33 : num 0.558 0.0863 0.4661 0.3321 0.3939 ...
## $ V34 : num 0.573 0.144 0.397 0.311 0.505 ...
## $ V35 : num 0.635 0.29 0.219 0.364 0.483 ...
## $ V36 : num 0.7563 0.4577 0.1816 0.0754 0.3511 ...
## $ V37 : num 0.69 0.372 0.102 0.183 0.232 ...
## $ V38 : num 0.618 0.337 0.211 0.182 0.403 ...
## $ V39 : num 0.538 0.38 0.325 0.181 0.368 ...
## $ V40 : num 0.562 0.418 0.37 0.159 0.151 ...
## $ V41 : num 0.6508 0.3603 0.2912 0.0576 0.0745 ...
## $ V42 : num 0.4797 0.2711 0.301 0.0954 0.1395 ...
## $ V43 : num 0.374 0.165 0.256 0.109 0.155 ...
## $ V44 : num 0.2804 0.1951 0.1927 0.0812 0.0377 ...
## $ V45 : num 0.1982 0.2811 0.2062 0.0784 0.0636 ...
## $ V46 : num 0.2438 0.2246 0.1751 0.0487 0.0443 ...
## $ V47 : num 0.1789 0.1921 0.0841 0.0439 0.0264 ...
## $ V48 : num 0.1706 0.15 0.1035 0.0586 0.0223 ...
## $ V49 : num 0.0762 0.0665 0.0641 0.037 0.0187 0.0245 0.0102 0.0436 0.0293 0.0469 ...
## $ V50 : num 0.0238 0.0193 0.0153 0.0185 0.0077 0.019 0.0057 0.0224 0.0183 0.0114 ...
## $ V51 : num 0.0268 0.0156 0.0081 0.0302 0.0137 0.0063 0.0031 0.0133 0.0104 0.0299 ...
## $ V52 : num 0.0081 0.0362 0.0191 0.0244 0.0071 0.0321 0.0163 0.0078 0.0117 0.0244 ...
## $ V53 : num 0.0129 0.021 0.0182 0.0232 0.0082 0.0189 0.0099 0.0174 0.0101 0.0199 ...
## $ V54 : num 0.0161 0.0154 0.016 0.0093 0.0232 0.0137 0.0084 0.0176 0.0061 0.0257 ...
## $ V55 : num 0.0063 0.018 0.029 0.0159 0.0198 0.0277 0.027 0.0038 0.0031 0.0082 ...
## $ V56 : num 0.0119 0.0013 0.009 0.0193 0.0074 0.0152 0.0277 0.0129 0.0099 0.0151 ...
## $ V57 : num 0.0194 0.0106 0.0242 0.0032 0.0035 0.0052 0.0097 0.0066 0.008 0.0171 ...
## $ V58 : num 0.014 0.0127 0.0224 0.0377 0.01 0.0121 0.0054 0.0044 0.0107 0.0146 ...
## $ V59 : num 0.0332 0.0178 0.019 0.0126 0.0048 0.0124 0.0148 0.0134 0.0161 0.0134 ...
## $ V60 : num 0.0439 0.0231 0.0096 0.0156 0.0019 0.0055 0.0092 0.0092 0.0133 0.0056 ...
## $ Class: Factor w/ 2 levels "M","R": 1 1 1 1 1 1 1 1 1 1 ...
summary(Sonar_Test)
## V1 V2 V3 V4
## Min. :0.00470 Min. :0.00220 Min. :0.00450 Min. :0.00760
## 1st Qu.:0.01620 1st Qu.:0.01392 1st Qu.:0.01770 1st Qu.:0.02615
## Median :0.02495 Median :0.03190 Median :0.03660 Median :0.04465
## Mean :0.03229 Mean :0.03954 Mean :0.04819 Mean :0.07107
## 3rd Qu.:0.03665 3rd Qu.:0.04850 3rd Qu.:0.05635 3rd Qu.:0.08830
## Max. :0.13130 Max. :0.23390 Max. :0.30590 Max. :0.42640
## V5 V6 V7 V8
## Min. :0.00970 Min. :0.02260 Min. :0.00330 Min. :0.00550
## 1st Qu.:0.03470 1st Qu.:0.05325 1st Qu.:0.06792 1st Qu.:0.08903
## Median :0.06155 Median :0.07610 Median :0.09480 Median :0.11180
## Mean :0.08154 Mean :0.08995 Mean :0.11237 Mean :0.12967
## 3rd Qu.:0.08470 3rd Qu.:0.11365 3rd Qu.:0.14510 3rd Qu.:0.15188
## Max. :0.40100 Max. :0.22470 Max. :0.33220 Max. :0.45900
## V9 V10 V11 V12
## Min. :0.0494 Min. :0.0193 Min. :0.0523 Min. :0.0259
## 1st Qu.:0.1000 1st Qu.:0.1261 1st Qu.:0.1572 1st Qu.:0.2245
## Median :0.1439 Median :0.1813 Median :0.2363 Median :0.2599
## Mean :0.1795 Mean :0.2212 Mean :0.2595 Mean :0.2809
## 3rd Qu.:0.2196 3rd Qu.:0.2596 3rd Qu.:0.2991 3rd Qu.:0.3141
## Max. :0.5664 Max. :0.7106 Max. :0.7342 Max. :0.5771
## V13 V14 V15 V16
## Min. :0.1184 Min. :0.0336 Min. :0.0166 Min. :0.0572
## 1st Qu.:0.2081 1st Qu.:0.2122 1st Qu.:0.1990 1st Qu.:0.2072
## Median :0.2581 Median :0.2959 Median :0.3125 Median :0.3199
## Mean :0.2880 Mean :0.3048 Mean :0.3301 Mean :0.3778
## 3rd Qu.:0.3155 3rd Qu.:0.3464 3rd Qu.:0.4298 3rd Qu.:0.5161
## Max. :0.7022 Max. :0.7067 Max. :0.7367 Max. :0.8278
## V17 V18 V19 V20
## Min. :0.1162 Min. :0.0837 Min. :0.1151 Min. :0.0902
## 1st Qu.:0.2159 1st Qu.:0.2492 1st Qu.:0.3366 1st Qu.:0.3652
## Median :0.3154 Median :0.3607 Median :0.5134 Median :0.6252
## Mean :0.4086 Mean :0.4693 Mean :0.5419 Mean :0.5995
## 3rd Qu.:0.6000 3rd Qu.:0.6776 3rd Qu.:0.8178 3rd Qu.:0.8684
## Max. :0.8999 Max. :1.0000 Max. :0.9975 Max. :0.9911
## V21 V22 V23 V24
## Min. :0.1354 Min. :0.1127 Min. :0.0563 Min. :0.0239
## 1st Qu.:0.4244 1st Qu.:0.4482 1st Qu.:0.5467 1st Qu.:0.5782
## Median :0.7064 Median :0.7190 Median :0.7579 Median :0.7542
## Mean :0.6382 Mean :0.6577 Mean :0.6836 Mean :0.7058
## 3rd Qu.:0.8115 3rd Qu.:0.8320 3rd Qu.:0.8524 3rd Qu.:0.8771
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V25 V26 V27 V28
## Min. :0.1934 Min. :0.0921 Min. :0.0481 Min. :0.0284
## 1st Qu.:0.5189 1st Qu.:0.4807 1st Qu.:0.4598 1st Qu.:0.5109
## Median :0.7201 Median :0.7925 Median :0.7719 Median :0.7435
## Mean :0.6937 Mean :0.6907 Mean :0.6910 Mean :0.6893
## 3rd Qu.:0.9090 3rd Qu.:0.9534 3rd Qu.:0.9674 3rd Qu.:0.9476
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## V29 V30 V31 V32
## Min. :0.1008 Min. :0.1714 Min. :0.0482 Min. :0.0404
## 1st Qu.:0.4543 1st Qu.:0.4012 1st Qu.:0.3486 1st Qu.:0.3132
## Median :0.6583 Median :0.6019 Median :0.4360 Median :0.4179
## Mean :0.6327 Mean :0.5438 Mean :0.4626 Mean :0.4103
## 3rd Qu.:0.8402 3rd Qu.:0.6990 3rd Qu.:0.5930 3rd Qu.:0.4918
## Max. :1.0000 Max. :0.9151 Max. :0.8828 Max. :0.9108
## V33 V34 V35 V36
## Min. :0.0637 Min. :0.0212 Min. :0.0619 Min. :0.0271
## 1st Qu.:0.2634 1st Qu.:0.2005 1st Qu.:0.1475 1st Qu.:0.1501
## Median :0.3797 Median :0.3052 Median :0.2669 Median :0.2350
## Mean :0.3832 Mean :0.3476 Mean :0.3285 Mean :0.3095
## 3rd Qu.:0.5090 3rd Qu.:0.4620 3rd Qu.:0.4560 3rd Qu.:0.4424
## Max. :0.7927 Max. :0.8703 Max. :1.0000 Max. :0.9212
## V37 V38 V39 V40
## Min. :0.0476 Min. :0.0411 Min. :0.0712 Min. :0.0325
## 1st Qu.:0.1535 1st Qu.:0.1741 1st Qu.:0.1754 1st Qu.:0.1572
## Median :0.2416 Median :0.3095 Median :0.3251 Median :0.2807
## Mean :0.2919 Mean :0.3190 Mean :0.3071 Mean :0.2859
## 3rd Qu.:0.4083 3rd Qu.:0.4115 3rd Qu.:0.3901 3rd Qu.:0.4062
## Max. :0.9386 Max. :0.9303 Max. :0.7601 Max. :0.6034
## V41 V42 V43 V44
## Min. :0.0360 Min. :0.0300 Min. :0.0537 Min. :0.0255
## 1st Qu.:0.1157 1st Qu.:0.1473 1st Qu.:0.1704 1st Qu.:0.1412
## Median :0.2497 Median :0.2228 Median :0.2265 Median :0.1953
## Mean :0.2644 Mean :0.2729 Mean :0.2534 Mean :0.2204
## 3rd Qu.:0.3752 3rd Qu.:0.4326 3rd Qu.:0.3649 3rd Qu.:0.2792
## Max. :0.6508 Max. :0.6443 Max. :0.4478 Max. :0.5245
## V45 V46 V47 V48
## Min. :0.0298 Min. :0.01380 Min. :0.0237 Min. :0.00410
## 1st Qu.:0.0908 1st Qu.:0.07405 1st Qu.:0.0744 1st Qu.:0.04977
## Median :0.1463 Median :0.12550 Median :0.1134 Median :0.08030
## Mean :0.1969 Mean :0.15892 Mean :0.1220 Mean :0.08778
## 3rd Qu.:0.2072 3rd Qu.:0.20820 3rd Qu.:0.1572 3rd Qu.:0.12095
## Max. :0.6149 Max. :0.52930 Max. :0.3385 Max. :0.20520
## V49 V50 V51 V52
## Min. :0.01020 Min. :0.00500 Min. :0.00260 Min. :0.00400
## 1st Qu.:0.02652 1st Qu.:0.01155 1st Qu.:0.01093 1st Qu.:0.00945
## Median :0.04525 Median :0.01875 Median :0.01550 Median :0.01340
## Mean :0.04845 Mean :0.01904 Mean :0.01648 Mean :0.01561
## 3rd Qu.:0.06732 3rd Qu.:0.02312 3rd Qu.:0.02050 3rd Qu.:0.01770
## Max. :0.10690 Max. :0.06370 Max. :0.03800 Max. :0.04590
## V53 V54 V55 V56
## Min. :0.001500 Min. :0.00180 Min. :0.001300 Min. :0.00130
## 1st Qu.:0.009125 1st Qu.:0.00605 1st Qu.:0.003875 1st Qu.:0.00450
## Median :0.012000 Median :0.00905 Median :0.006250 Median :0.00700
## Mean :0.012740 Mean :0.01205 Mean :0.010433 Mean :0.00858
## 3rd Qu.:0.015675 3rd Qu.:0.01638 3rd Qu.:0.014550 3rd Qu.:0.01063
## Max. :0.039000 Max. :0.03350 Max. :0.037600 Max. :0.02770
## V57 V58 V59 V60
## Min. :0.000900 Min. :0.000600 Min. :0.000200 Min. :0.00150
## 1st Qu.:0.003425 1st Qu.:0.003600 1st Qu.:0.003575 1st Qu.:0.00310
## Median :0.005800 Median :0.005800 Median :0.006000 Median :0.00570
## Mean :0.007403 Mean :0.008155 Mean :0.009057 Mean :0.00817
## 3rd Qu.:0.009025 3rd Qu.:0.011650 3rd Qu.:0.012450 3rd Qu.:0.01020
## Max. :0.024200 Max. :0.037700 Max. :0.036400 Max. :0.04390
## Class
## M:33
## R: 7
##
##
##
##
##################################
# Formulating a data type assessment summary
##################################
PDA <- Sonar_Train

(PDA.Summary <- data.frame(
  Column.Index=c(1:length(names(PDA))),
  Column.Name=names(PDA),
  Column.Type=sapply(PDA, function(x) class(x)),
  row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 V1 numeric
## 2 2 V2 numeric
## 3 3 V3 numeric
## 4 4 V4 numeric
## 5 5 V5 numeric
## 6 6 V6 numeric
## 7 7 V7 numeric
## 8 8 V8 numeric
## 9 9 V9 numeric
## 10 10 V10 numeric
## 11 11 V11 numeric
## 12 12 V12 numeric
## 13 13 V13 numeric
## 14 14 V14 numeric
## 15 15 V15 numeric
## 16 16 V16 numeric
## 17 17 V17 numeric
## 18 18 V18 numeric
## 19 19 V19 numeric
## 20 20 V20 numeric
## 21 21 V21 numeric
## 22 22 V22 numeric
## 23 23 V23 numeric
## 24 24 V24 numeric
## 25 25 V25 numeric
## 26 26 V26 numeric
## 27 27 V27 numeric
## 28 28 V28 numeric
## 29 29 V29 numeric
## 30 30 V30 numeric
## 31 31 V31 numeric
## 32 32 V32 numeric
## 33 33 V33 numeric
## 34 34 V34 numeric
## 35 35 V35 numeric
## 36 36 V36 numeric
## 37 37 V37 numeric
## 38 38 V38 numeric
## 39 39 V39 numeric
## 40 40 V40 numeric
## 41 41 V41 numeric
## 42 42 V42 numeric
## 43 43 V43 numeric
## 44 44 V44 numeric
## 45 45 V45 numeric
## 46 46 V46 numeric
## 47 47 V47 numeric
## 48 48 V48 numeric
## 49 49 V49 numeric
## 50 50 V50 numeric
## 51 51 V51 numeric
## 52 52 V52 numeric
## 53 53 V53 numeric
## 54 54 V54 numeric
## 55 55 V55 numeric
## 56 56 V56 numeric
## 57 57 V57 numeric
## 58 58 V58 numeric
## 59 59 V59 numeric
## 60 60 V60 numeric
## 61 61 Class factor
##################################
# Loading dataset
##################################
DQA <- Sonar_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
  Column.Index=c(1:length(names(DQA))),
  Column.Name=names(DQA),
  Column.Type=sapply(DQA, function(x) class(x)),
  Row.Count=sapply(DQA, function(x) nrow(DQA)),
  NA.Count=sapply(DQA,function(x)sum(is.na(x))),
  Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
  row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 V1 numeric 96 0 1.000
## 2 2 V2 numeric 96 0 1.000
## 3 3 V3 numeric 96 0 1.000
## 4 4 V4 numeric 96 0 1.000
## 5 5 V5 numeric 96 0 1.000
## 6 6 V6 numeric 96 0 1.000
## 7 7 V7 numeric 96 0 1.000
## 8 8 V8 numeric 96 0 1.000
## 9 9 V9 numeric 96 0 1.000
## 10 10 V10 numeric 96 0 1.000
## 11 11 V11 numeric 96 0 1.000
## 12 12 V12 numeric 96 0 1.000
## 13 13 V13 numeric 96 0 1.000
## 14 14 V14 numeric 96 0 1.000
## 15 15 V15 numeric 96 0 1.000
## 16 16 V16 numeric 96 0 1.000
## 17 17 V17 numeric 96 0 1.000
## 18 18 V18 numeric 96 0 1.000
## 19 19 V19 numeric 96 0 1.000
## 20 20 V20 numeric 96 0 1.000
## 21 21 V21 numeric 96 0 1.000
## 22 22 V22 numeric 96 0 1.000
## 23 23 V23 numeric 96 0 1.000
## 24 24 V24 numeric 96 0 1.000
## 25 25 V25 numeric 96 0 1.000
## 26 26 V26 numeric 96 0 1.000
## 27 27 V27 numeric 96 0 1.000
## 28 28 V28 numeric 96 0 1.000
## 29 29 V29 numeric 96 0 1.000
## 30 30 V30 numeric 96 0 1.000
## 31 31 V31 numeric 96 0 1.000
## 32 32 V32 numeric 96 0 1.000
## 33 33 V33 numeric 96 0 1.000
## 34 34 V34 numeric 96 0 1.000
## 35 35 V35 numeric 96 0 1.000
## 36 36 V36 numeric 96 0 1.000
## 37 37 V37 numeric 96 0 1.000
## 38 38 V38 numeric 96 0 1.000
## 39 39 V39 numeric 96 0 1.000
## 40 40 V40 numeric 96 0 1.000
## 41 41 V41 numeric 96 0 1.000
## 42 42 V42 numeric 96 0 1.000
## 43 43 V43 numeric 96 0 1.000
## 44 44 V44 numeric 96 0 1.000
## 45 45 V45 numeric 96 0 1.000
## 46 46 V46 numeric 96 0 1.000
## 47 47 V47 numeric 96 0 1.000
## 48 48 V48 numeric 96 0 1.000
## 49 49 V49 numeric 96 0 1.000
## 50 50 V50 numeric 96 0 1.000
## 51 51 V51 numeric 96 0 1.000
## 52 52 V52 numeric 96 0 1.000
## 53 53 V53 numeric 96 0 1.000
## 54 54 V54 numeric 96 0 1.000
## 55 55 V55 numeric 96 0 1.000
## 56 56 V56 numeric 96 0 1.000
## 57 57 V57 numeric 96 0 1.000
## 58 58 V58 numeric 96 0 1.000
## 59 59 V59 numeric 96 0 1.000
## 60 60 V60 numeric 96 0 1.000
## 61 61 Class factor 96 0 1.000
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]

if (length(names(DQA.Predictors.Numeric))>0) {
  print(paste0("There are ",
               length(names(DQA.Predictors.Numeric)),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 60 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]

if (length(names(DQA.Predictors.Factor))>0) {
  print(paste0("There are ",
               length(names(DQA.Predictors.Factor)),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  FirstModes <- function(x) {
    ux  <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  SecondModes <- function(x) {
    ux    <- unique(na.omit(x))
    tab   <- tabulate(match(x, ux))
    fm    <- ux[tab == max(tab)]
    sm    <- x[!(x %in% fm)]
    usm   <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
           return("x"),
           return(usm[tabsm == max(tabsm)]))
  }

  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name=names(DQA.Predictors.Factor),
    Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
    Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
    First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
    Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
    First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
    First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
    row.names=NULL)
  )

}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  FirstModes <- function(x) {
    ux  <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  SecondModes <- function(x) {
    ux    <- unique(na.omit(x))
    tab   <- tabulate(match(x, ux))
    fm    <- ux[tab == max(tab)]
    sm    <- na.omit(x)[!(na.omit(x) %in% fm)]
    usm   <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
           return(0.00001),
           return(usm[tabsm == max(tabsm)]))
  }

  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name=names(DQA.Predictors.Numeric),
    Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
    Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
    Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
    First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
    Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
    First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
    Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
    Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
    Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
    Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
    Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x,na.rm = TRUE),3), nsmall=3)),
    Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x,na.rm = TRUE),3), nsmall=3)),
    Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
    Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
    row.names=NULL)
  )

}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 V1 numeric 91 0.948 0.021
## 2 V2 numeric 93 0.969 0.019
## 3 V3 numeric 92 0.958 0.030
## 4 V4 numeric 90 0.938 0.061
## 5 V5 numeric 93 0.969 0.112
## 6 V6 numeric 94 0.979 0.152
## 7 V7 numeric 94 0.979 0.149
## 8 V8 numeric 95 0.990 0.168
## 9 V9 numeric 96 1.000 0.211
## 10 V10 numeric 96 1.000 0.360
## 11 V11 numeric 94 0.979 0.213
## 12 V12 numeric 96 1.000 0.520
## 13 V13 numeric 95 0.990 0.286
## 14 V14 numeric 93 0.969 0.290
## 15 V15 numeric 93 0.969 0.377
## 16 V16 numeric 96 1.000 0.871
## 17 V17 numeric 94 0.979 1.000
## 18 V18 numeric 95 0.990 0.243
## 19 V19 numeric 96 1.000 0.792
## 20 V20 numeric 93 0.969 0.769
## 21 V21 numeric 93 0.969 1.000
## 22 V22 numeric 94 0.979 1.000
## 23 V23 numeric 94 0.979 1.000
## 24 V24 numeric 94 0.979 1.000
## 25 V25 numeric 93 0.969 1.000
## 26 V26 numeric 91 0.948 1.000
## 27 V27 numeric 90 0.938 1.000
## 28 V28 numeric 88 0.917 1.000
## 29 V29 numeric 93 0.969 1.000
## 30 V30 numeric 93 0.969 1.000
## 31 V31 numeric 95 0.990 0.386
## 32 V32 numeric 96 1.000 0.189
## 33 V33 numeric 94 0.979 0.525
## 34 V34 numeric 96 1.000 0.196
## 35 V35 numeric 96 1.000 0.267
## 36 V36 numeric 95 0.990 0.233
## 37 V37 numeric 96 1.000 0.107
## 38 V38 numeric 96 1.000 0.202
## 39 V39 numeric 95 0.990 0.089
## 40 V40 numeric 95 0.990 0.443
## 41 V41 numeric 96 1.000 0.131
## 42 V42 numeric 96 1.000 0.178
## 43 V43 numeric 96 1.000 0.155
## 44 V44 numeric 95 0.990 0.192
## 45 V45 numeric 96 1.000 0.071
## 46 V46 numeric 95 0.990 0.096
## 47 V47 numeric 94 0.979 0.080
## 48 V48 numeric 96 1.000 0.076
## 49 V49 numeric 95 0.990 0.108
## 50 V50 numeric 83 0.865 0.018
## 51 V51 numeric 83 0.865 0.014
## 52 V52 numeric 83 0.865 0.009
## 53 V53 numeric 78 0.812 0.018
## 54 V54 numeric 79 0.823 0.011
## 55 V55 numeric 75 0.781 0.008
## 56 V56 numeric 79 0.823 0.003
## 57 V57 numeric 72 0.750 0.005
## 58 V58 numeric 71 0.740 0.010
## 59 V59 numeric 70 0.729 0.008
## 60 V60 numeric 70 0.729 0.003
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 0.023 3 2 1.500
## 2 0.106 2 1 2.000
## 3 0.153 2 1 2.000
## 4 0.039 3 2 1.500
## 5 0.144 2 1 2.000
## 6 0.119 2 1 2.000
## 7 0.088 2 1 2.000
## 8 0.091 2 1 2.000
## 9 0.000 1 0 Inf
## 10 0.000 1 0 Inf
## 11 0.547 2 1 2.000
## 12 0.000 1 0 Inf
## 13 0.513 2 1 2.000
## 14 0.539 2 1 2.000
## 15 0.656 2 1 2.000
## 16 0.000 1 0 Inf
## 17 0.979 2 1 2.000
## 18 0.933 2 1 2.000
## 19 0.000 1 0 Inf
## 20 0.738 2 1 2.000
## 21 0.691 4 1 4.000
## 22 0.385 3 1 3.000
## 23 0.067 3 1 3.000
## 24 0.050 3 1 3.000
## 25 0.272 4 1 4.000
## 26 0.284 6 1 6.000
## 27 0.892 5 2 2.500
## 28 0.191 9 1 9.000
## 29 0.904 3 2 1.500
## 30 0.253 4 1 4.000
## 31 0.198 2 1 2.000
## 32 0.000 1 0 Inf
## 33 0.243 2 1 2.000
## 34 0.000 1 0 Inf
## 35 0.000 1 0 Inf
## 36 0.134 2 1 2.000
## 37 0.000 1 0 Inf
## 38 0.000 1 0 Inf
## 39 0.179 2 1 2.000
## 40 0.023 2 1 2.000
## 41 0.000 1 0 Inf
## 42 0.000 1 0 Inf
## 43 0.000 1 0 Inf
## 44 0.163 2 1 2.000
## 45 0.000 1 0 Inf
## 46 0.013 2 1 2.000
## 47 0.080 2 1 2.000
## 48 0.000 1 0 Inf
## 49 0.012 2 1 2.000
## 50 0.026 3 2 1.500
## 51 0.025 3 2 1.500
## 52 0.009 3 2 1.500
## 53 0.026 3 2 1.500
## 54 0.008 3 2 1.500
## 55 0.004 4 3 1.333
## 56 0.008 3 2 1.500
## 57 0.004 4 3 1.333
## 58 0.006 3 2 1.500
## 59 0.007 4 3 1.333
## 60 0.002 4 3 1.333
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 1 0.002 0.032 0.023 0.137 1.792 6.412 0.014 0.040
## 2 0.001 0.044 0.034 0.157 1.257 4.025 0.019 0.059
## 3 0.002 0.047 0.039 0.166 1.468 5.010 0.024 0.062
## 4 0.006 0.056 0.049 0.164 1.121 3.944 0.030 0.072
## 5 0.007 0.082 0.074 0.248 0.841 3.808 0.045 0.109
## 6 0.010 0.118 0.113 0.382 1.173 5.889 0.078 0.150
## 7 0.018 0.133 0.130 0.373 0.754 4.295 0.094 0.168
## 8 0.012 0.151 0.136 0.457 1.233 4.917 0.095 0.191
## 9 0.007 0.204 0.182 0.683 1.542 6.658 0.130 0.260
## 10 0.011 0.233 0.212 0.597 0.912 3.670 0.142 0.294
## 11 0.053 0.266 0.252 0.667 0.727 3.824 0.193 0.334
## 12 0.024 0.280 0.278 0.568 0.325 2.699 0.184 0.350
## 13 0.062 0.302 0.293 0.713 0.544 3.116 0.212 0.373
## 14 0.027 0.314 0.290 0.997 1.098 4.726 0.185 0.405
## 15 0.009 0.319 0.275 0.914 0.891 3.188 0.167 0.440
## 16 0.042 0.375 0.320 0.975 0.810 2.804 0.191 0.533
## 17 0.037 0.414 0.316 1.000 0.678 2.281 0.209 0.647
## 18 0.038 0.447 0.373 0.933 0.480 1.909 0.243 0.673
## 19 0.132 0.513 0.446 0.983 0.265 1.756 0.296 0.731
## 20 0.066 0.586 0.622 1.000 -0.329 1.943 0.397 0.798
## 21 0.051 0.639 0.694 1.000 -0.553 2.254 0.441 0.845
## 22 0.022 0.636 0.702 1.000 -0.522 2.078 0.399 0.850
## 23 0.061 0.650 0.714 1.000 -0.642 2.271 0.453 0.869
## 24 0.050 0.680 0.699 1.000 -0.741 2.742 0.580 0.897
## 25 0.024 0.681 0.721 1.000 -0.824 3.019 0.569 0.875
## 26 0.164 0.708 0.756 1.000 -0.686 2.618 0.564 0.877
## 27 0.104 0.707 0.793 1.000 -0.682 2.297 0.496 0.911
## 28 0.060 0.708 0.776 1.000 -0.681 2.421 0.558 0.912
## 29 0.014 0.652 0.710 1.000 -0.601 2.411 0.467 0.867
## 30 0.061 0.587 0.603 1.000 -0.025 2.431 0.414 0.719
## 31 0.100 0.497 0.442 0.966 0.465 2.307 0.327 0.646
## 32 0.088 0.436 0.408 0.931 0.460 2.307 0.277 0.582
## 33 0.048 0.408 0.387 1.000 0.470 2.647 0.236 0.541
## 34 0.059 0.394 0.364 0.954 0.588 2.552 0.216 0.542
## 35 0.022 0.377 0.293 0.952 0.570 2.141 0.175 0.578
## 36 0.008 0.365 0.281 1.000 0.794 2.447 0.138 0.535
## 37 0.035 0.352 0.259 0.912 0.783 2.269 0.145 0.488
## 38 0.062 0.346 0.324 0.948 0.995 3.280 0.175 0.441
## 39 0.044 0.346 0.306 0.971 0.796 3.067 0.183 0.480
## 40 0.023 0.317 0.281 0.930 0.806 3.719 0.196 0.427
## 41 0.044 0.304 0.266 0.899 0.879 3.612 0.170 0.409
## 42 0.044 0.305 0.281 0.825 0.869 3.538 0.169 0.397
## 43 0.031 0.272 0.258 0.752 0.658 3.105 0.161 0.347
## 44 0.025 0.237 0.192 0.577 0.861 2.674 0.139 0.308
## 45 0.035 0.236 0.174 0.703 0.958 2.730 0.111 0.363
## 46 0.008 0.193 0.145 0.729 1.458 4.505 0.085 0.228
## 47 0.018 0.143 0.109 0.552 1.636 5.675 0.077 0.182
## 48 0.008 0.109 0.094 0.334 1.102 3.696 0.053 0.135
## 49 0.007 0.064 0.054 0.198 0.974 3.513 0.033 0.091
## 50 0.004 0.024 0.019 0.082 1.591 5.793 0.013 0.029
## 51 0.001 0.019 0.016 0.100 2.728 14.443 0.010 0.024
## 52 0.001 0.015 0.012 0.071 2.229 10.249 0.009 0.018
## 53 0.000 0.011 0.008 0.036 1.024 3.674 0.005 0.015
## 54 0.001 0.012 0.010 0.035 0.991 3.390 0.005 0.015
## 55 0.001 0.009 0.008 0.045 1.958 8.464 0.004 0.013
## 56 0.000 0.008 0.007 0.039 2.124 10.327 0.004 0.012
## 57 0.001 0.008 0.006 0.035 1.859 8.338 0.004 0.011
## 58 0.001 0.008 0.006 0.044 2.152 9.133 0.004 0.010
## 59 0.000 0.008 0.007 0.029 1.280 4.224 0.004 0.011
## 60 0.001 0.006 0.005 0.022 1.381 5.177 0.003 0.008
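##################################
# Illustrating the mode helper functions
##################################
# Hedged example, not part of the original output; it assumes the
# FirstModes() and SecondModes() helpers defined above are in scope.
# FirstModes() returns the most frequent value(s); SecondModes() returns
# the next most frequent after removing all first-mode occurrences.
ModeDemo <- c(0.1, 0.1, 0.1, 0.2, 0.2, 0.3)
FirstModes(ModeDemo)[1]   # expected: 0.1 (appears three times)
SecondModes(ModeDemo)[1]  # expected: 0.2 (appears twice)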
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
  print(paste0("Missing observations noted for ",
               nrow(DQA.Summary[DQA.Summary$NA.Count>0,]),
               " variable(s) with NA.Count>0 and Fill.Rate<1.0."))
  DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
  print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]),
               " factor variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
if (length(names(DQA.Predictors.Numeric))==0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]),
               " numeric variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
  print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 17 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 9 V9 numeric 96 1.000 0.211
## 10 V10 numeric 96 1.000 0.360
## 12 V12 numeric 96 1.000 0.520
## 16 V16 numeric 96 1.000 0.871
## 19 V19 numeric 96 1.000 0.792
## 26 V26 numeric 91 0.948 1.000
## 28 V28 numeric 88 0.917 1.000
## 32 V32 numeric 96 1.000 0.189
## 34 V34 numeric 96 1.000 0.196
## 35 V35 numeric 96 1.000 0.267
## 37 V37 numeric 96 1.000 0.107
## 38 V38 numeric 96 1.000 0.202
## 41 V41 numeric 96 1.000 0.131
## 42 V42 numeric 96 1.000 0.178
## 43 V43 numeric 96 1.000 0.155
## 45 V45 numeric 96 1.000 0.071
## 48 V48 numeric 96 1.000 0.076
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 9 0.000 1 0 Inf
## 10 0.000 1 0 Inf
## 12 0.000 1 0 Inf
## 16 0.000 1 0 Inf
## 19 0.000 1 0 Inf
## 26 0.284 6 1 6.000
## 28 0.191 9 1 9.000
## 32 0.000 1 0 Inf
## 34 0.000 1 0 Inf
## 35 0.000 1 0 Inf
## 37 0.000 1 0 Inf
## 38 0.000 1 0 Inf
## 41 0.000 1 0 Inf
## 42 0.000 1 0 Inf
## 43 0.000 1 0 Inf
## 45 0.000 1 0 Inf
## 48 0.000 1 0 Inf
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 9 0.007 0.204 0.182 0.683 1.542 6.658 0.130 0.260
## 10 0.011 0.233 0.212 0.597 0.912 3.670 0.142 0.294
## 12 0.024 0.280 0.278 0.568 0.325 2.699 0.184 0.350
## 16 0.042 0.375 0.320 0.975 0.810 2.804 0.191 0.533
## 19 0.132 0.513 0.446 0.983 0.265 1.756 0.296 0.731
## 26 0.164 0.708 0.756 1.000 -0.686 2.618 0.564 0.877
## 28 0.060 0.708 0.776 1.000 -0.681 2.421 0.558 0.912
## 32 0.088 0.436 0.408 0.931 0.460 2.307 0.277 0.582
## 34 0.059 0.394 0.364 0.954 0.588 2.552 0.216 0.542
## 35 0.022 0.377 0.293 0.952 0.570 2.141 0.175 0.578
## 37 0.035 0.352 0.259 0.912 0.783 2.269 0.145 0.488
## 38 0.062 0.346 0.324 0.948 0.995 3.280 0.175 0.441
## 41 0.044 0.304 0.266 0.899 0.879 3.612 0.170 0.409
## 42 0.044 0.305 0.281 0.825 0.869 3.538 0.169 0.397
## 43 0.031 0.272 0.258 0.752 0.658 3.105 0.161 0.347
## 45 0.035 0.236 0.174 0.703 0.958 2.730 0.111 0.363
## 48 0.008 0.109 0.094 0.334 1.102 3.696 0.053 0.135
if (length(names(DQA.Predictors.Numeric))==0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
  print(paste0("Low variance observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]),
               " numeric variable(s) with Unique.Count.Ratio<0.01."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
  print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
                as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
  print(paste0("High skewness observed for ",
               nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
                    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]),
               " numeric variable(s) with Skewness>3 or Skewness<(-3)."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
                                 as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
  print("No skewed numeric predictors noted.")
}
## [1] "No skewed numeric predictors noted."
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying outliers for the numeric predictors
##################################
OutlierCountList <- c()

for (i in 1:ncol(DPA.Predictors.Numeric)) {
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[,i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList <- append(OutlierCountList,OutlierCount)
  OutlierIndices <- which(DPA.Predictors.Numeric[,i] %in% c(Outliers))
  boxplot(DPA.Predictors.Numeric[,i],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal=TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}

OutlierCountSummary <- as.data.frame(cbind(names(DPA.Predictors.Numeric),(OutlierCountList)))
names(OutlierCountSummary) <- c("NumericPredictors","OutlierCount")
OutlierCountSummary$OutlierCount <- as.numeric(as.character(OutlierCountSummary$OutlierCount))
NumericPredictorWithOutlierCount <- nrow(OutlierCountSummary[OutlierCountSummary$OutlierCount>0,])
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "38 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Name | DPA.Predictors.Numeric |
Number of rows | 96 |
Number of columns | 60 |
_______________________ | |
Column type frequency: | |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.01 | 0.02 | 0.04 | 0.14 | ▇▃▂▁▁ |
V2 | 0 | 1 | 0.04 | 0.03 | 0.00 | 0.02 | 0.03 | 0.06 | 0.16 | ▇▆▂▁▁ |
V3 | 0 | 1 | 0.05 | 0.03 | 0.00 | 0.02 | 0.04 | 0.06 | 0.17 | ▇▆▂▁▁ |
V4 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.07 | 0.16 | ▇▇▃▁▁ |
V5 | 0 | 1 | 0.08 | 0.05 | 0.01 | 0.05 | 0.07 | 0.11 | 0.25 | ▇▇▅▁▁ |
V6 | 0 | 1 | 0.12 | 0.06 | 0.01 | 0.08 | 0.11 | 0.15 | 0.38 | ▅▇▂▁▁ |
V7 | 0 | 1 | 0.13 | 0.06 | 0.02 | 0.09 | 0.13 | 0.17 | 0.37 | ▃▇▃▁▁ |
V8 | 0 | 1 | 0.15 | 0.09 | 0.01 | 0.10 | 0.14 | 0.19 | 0.46 | ▆▇▃▁▁ |
V9 | 0 | 1 | 0.20 | 0.12 | 0.01 | 0.13 | 0.18 | 0.26 | 0.68 | ▅▇▃▁▁ |
V10 | 0 | 1 | 0.23 | 0.13 | 0.01 | 0.14 | 0.21 | 0.29 | 0.60 | ▃▇▅▂▁ |
V11 | 0 | 1 | 0.27 | 0.12 | 0.05 | 0.19 | 0.25 | 0.33 | 0.67 | ▅▇▅▂▁ |
V12 | 0 | 1 | 0.28 | 0.13 | 0.02 | 0.18 | 0.28 | 0.35 | 0.57 | ▃▅▇▃▂ |
V13 | 0 | 1 | 0.30 | 0.13 | 0.06 | 0.21 | 0.29 | 0.37 | 0.71 | ▃▇▅▂▁ |
V14 | 0 | 1 | 0.31 | 0.17 | 0.03 | 0.19 | 0.29 | 0.41 | 1.00 | ▇▇▃▁▁ |
V15 | 0 | 1 | 0.32 | 0.21 | 0.01 | 0.17 | 0.28 | 0.44 | 0.91 | ▇▅▆▁▁ |
V16 | 0 | 1 | 0.38 | 0.23 | 0.04 | 0.19 | 0.32 | 0.53 | 0.98 | ▇▅▅▂▂ |
V17 | 0 | 1 | 0.41 | 0.25 | 0.04 | 0.21 | 0.32 | 0.65 | 1.00 | ▇▇▃▅▂ |
V18 | 0 | 1 | 0.45 | 0.25 | 0.04 | 0.24 | 0.37 | 0.67 | 0.93 | ▃▇▂▃▃ |
V19 | 0 | 1 | 0.51 | 0.25 | 0.13 | 0.30 | 0.45 | 0.73 | 0.98 | ▇▇▃▅▅ |
V20 | 0 | 1 | 0.59 | 0.26 | 0.07 | 0.40 | 0.62 | 0.80 | 1.00 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0.64 | 0.27 | 0.05 | 0.44 | 0.69 | 0.84 | 1.00 | ▂▃▅▆▇ |
V22 | 0 | 1 | 0.64 | 0.27 | 0.02 | 0.40 | 0.70 | 0.85 | 1.00 | ▂▃▂▆▇ |
V23 | 0 | 1 | 0.65 | 0.26 | 0.06 | 0.45 | 0.71 | 0.87 | 1.00 | ▂▂▃▅▇ |
V24 | 0 | 1 | 0.68 | 0.24 | 0.05 | 0.58 | 0.70 | 0.90 | 1.00 | ▂▂▃▇▇ |
V25 | 0 | 1 | 0.68 | 0.25 | 0.02 | 0.57 | 0.72 | 0.87 | 1.00 | ▂▂▃▇▇ |
V26 | 0 | 1 | 0.71 | 0.22 | 0.16 | 0.56 | 0.76 | 0.88 | 1.00 | ▂▂▅▇▇ |
V27 | 0 | 1 | 0.71 | 0.25 | 0.10 | 0.50 | 0.79 | 0.91 | 1.00 | ▁▂▂▃▇ |
V28 | 0 | 1 | 0.71 | 0.25 | 0.06 | 0.56 | 0.78 | 0.91 | 1.00 | ▁▂▃▅▇ |
V29 | 0 | 1 | 0.65 | 0.25 | 0.01 | 0.47 | 0.71 | 0.87 | 1.00 | ▂▂▅▅▇ |
V30 | 0 | 1 | 0.59 | 0.22 | 0.06 | 0.41 | 0.60 | 0.72 | 1.00 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0.50 | 0.23 | 0.10 | 0.33 | 0.44 | 0.65 | 0.97 | ▃▇▅▃▃ |
V32 | 0 | 1 | 0.44 | 0.22 | 0.09 | 0.28 | 0.41 | 0.58 | 0.93 | ▆▇▆▃▂ |
V33 | 0 | 1 | 0.41 | 0.21 | 0.05 | 0.24 | 0.39 | 0.54 | 1.00 | ▆▇▇▃▁ |
V34 | 0 | 1 | 0.39 | 0.22 | 0.06 | 0.22 | 0.36 | 0.54 | 0.95 | ▇▇▃▅▂ |
V35 | 0 | 1 | 0.38 | 0.26 | 0.02 | 0.17 | 0.29 | 0.58 | 0.95 | ▇▆▃▃▂ |
V36 | 0 | 1 | 0.36 | 0.28 | 0.01 | 0.14 | 0.28 | 0.53 | 1.00 | ▇▅▂▂▂ |
V37 | 0 | 1 | 0.35 | 0.26 | 0.04 | 0.14 | 0.26 | 0.49 | 0.91 | ▇▃▂▁▃ |
V38 | 0 | 1 | 0.35 | 0.22 | 0.06 | 0.17 | 0.32 | 0.44 | 0.95 | ▇▆▂▂▁ |
V39 | 0 | 1 | 0.35 | 0.20 | 0.04 | 0.18 | 0.31 | 0.48 | 0.97 | ▇▇▅▂▁ |
V40 | 0 | 1 | 0.32 | 0.18 | 0.02 | 0.20 | 0.28 | 0.43 | 0.93 | ▅▇▃▁▁ |
V41 | 0 | 1 | 0.30 | 0.17 | 0.04 | 0.17 | 0.27 | 0.41 | 0.90 | ▇▇▃▂▁ |
V42 | 0 | 1 | 0.31 | 0.18 | 0.04 | 0.17 | 0.28 | 0.40 | 0.82 | ▇▇▆▁▁ |
V43 | 0 | 1 | 0.27 | 0.15 | 0.03 | 0.16 | 0.26 | 0.35 | 0.75 | ▆▇▅▂▁ |
V44 | 0 | 1 | 0.24 | 0.14 | 0.03 | 0.14 | 0.19 | 0.31 | 0.58 | ▅▇▂▂▂ |
V45 | 0 | 1 | 0.24 | 0.17 | 0.04 | 0.11 | 0.17 | 0.36 | 0.70 | ▇▃▂▂▁ |
V46 | 0 | 1 | 0.19 | 0.16 | 0.01 | 0.08 | 0.14 | 0.23 | 0.73 | ▇▅▂▁▁ |
V47 | 0 | 1 | 0.14 | 0.10 | 0.02 | 0.08 | 0.11 | 0.18 | 0.55 | ▇▃▁▁▁ |
V48 | 0 | 1 | 0.11 | 0.07 | 0.01 | 0.05 | 0.09 | 0.14 | 0.33 | ▆▇▂▁▁ |
V49 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.09 | 0.20 | ▇▇▃▂▁ |
V50 | 0 | 1 | 0.02 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.08 | ▇▅▂▁▁ |
V51 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.02 | 0.02 | 0.10 | ▇▃▁▁▁ |
V52 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.07 | ▇▃▁▁▁ |
V53 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.02 | 0.04 | ▇▃▃▁▁ |
V54 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.04 | ▆▇▂▂▁ |
V55 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▃▁▁▁ |
V56 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
V57 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
V58 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▂▁▁▁ |
V59 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.03 | ▇▇▂▁▁ |
V60 | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.02 | ▇▆▂▁▁ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric)
## [1] 96 60
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Gathering descriptive statistics
##################################
(DPA_Skimmed <- skim(DPA))
Name | DPA |
Number of rows | 96 |
Number of columns | 61 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Class | 0 | 1 | FALSE | 2 | M: 78, R: 18 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.01 | 0.02 | 0.04 | 0.14 | ▇▃▂▁▁ |
V2 | 0 | 1 | 0.04 | 0.03 | 0.00 | 0.02 | 0.03 | 0.06 | 0.16 | ▇▆▂▁▁ |
V3 | 0 | 1 | 0.05 | 0.03 | 0.00 | 0.02 | 0.04 | 0.06 | 0.17 | ▇▆▂▁▁ |
V4 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.07 | 0.16 | ▇▇▃▁▁ |
V5 | 0 | 1 | 0.08 | 0.05 | 0.01 | 0.05 | 0.07 | 0.11 | 0.25 | ▇▇▅▁▁ |
V6 | 0 | 1 | 0.12 | 0.06 | 0.01 | 0.08 | 0.11 | 0.15 | 0.38 | ▅▇▂▁▁ |
V7 | 0 | 1 | 0.13 | 0.06 | 0.02 | 0.09 | 0.13 | 0.17 | 0.37 | ▃▇▃▁▁ |
V8 | 0 | 1 | 0.15 | 0.09 | 0.01 | 0.10 | 0.14 | 0.19 | 0.46 | ▆▇▃▁▁ |
V9 | 0 | 1 | 0.20 | 0.12 | 0.01 | 0.13 | 0.18 | 0.26 | 0.68 | ▅▇▃▁▁ |
V10 | 0 | 1 | 0.23 | 0.13 | 0.01 | 0.14 | 0.21 | 0.29 | 0.60 | ▃▇▅▂▁ |
V11 | 0 | 1 | 0.27 | 0.12 | 0.05 | 0.19 | 0.25 | 0.33 | 0.67 | ▅▇▅▂▁ |
V12 | 0 | 1 | 0.28 | 0.13 | 0.02 | 0.18 | 0.28 | 0.35 | 0.57 | ▃▅▇▃▂ |
V13 | 0 | 1 | 0.30 | 0.13 | 0.06 | 0.21 | 0.29 | 0.37 | 0.71 | ▃▇▅▂▁ |
V14 | 0 | 1 | 0.31 | 0.17 | 0.03 | 0.19 | 0.29 | 0.41 | 1.00 | ▇▇▃▁▁ |
V15 | 0 | 1 | 0.32 | 0.21 | 0.01 | 0.17 | 0.28 | 0.44 | 0.91 | ▇▅▆▁▁ |
V16 | 0 | 1 | 0.38 | 0.23 | 0.04 | 0.19 | 0.32 | 0.53 | 0.98 | ▇▅▅▂▂ |
V17 | 0 | 1 | 0.41 | 0.25 | 0.04 | 0.21 | 0.32 | 0.65 | 1.00 | ▇▇▃▅▂ |
V18 | 0 | 1 | 0.45 | 0.25 | 0.04 | 0.24 | 0.37 | 0.67 | 0.93 | ▃▇▂▃▃ |
V19 | 0 | 1 | 0.51 | 0.25 | 0.13 | 0.30 | 0.45 | 0.73 | 0.98 | ▇▇▃▅▅ |
V20 | 0 | 1 | 0.59 | 0.26 | 0.07 | 0.40 | 0.62 | 0.80 | 1.00 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0.64 | 0.27 | 0.05 | 0.44 | 0.69 | 0.84 | 1.00 | ▂▃▅▆▇ |
V22 | 0 | 1 | 0.64 | 0.27 | 0.02 | 0.40 | 0.70 | 0.85 | 1.00 | ▂▃▂▆▇ |
V23 | 0 | 1 | 0.65 | 0.26 | 0.06 | 0.45 | 0.71 | 0.87 | 1.00 | ▂▂▃▅▇ |
V24 | 0 | 1 | 0.68 | 0.24 | 0.05 | 0.58 | 0.70 | 0.90 | 1.00 | ▂▂▃▇▇ |
V25 | 0 | 1 | 0.68 | 0.25 | 0.02 | 0.57 | 0.72 | 0.87 | 1.00 | ▂▂▃▇▇ |
V26 | 0 | 1 | 0.71 | 0.22 | 0.16 | 0.56 | 0.76 | 0.88 | 1.00 | ▂▂▅▇▇ |
V27 | 0 | 1 | 0.71 | 0.25 | 0.10 | 0.50 | 0.79 | 0.91 | 1.00 | ▁▂▂▃▇ |
V28 | 0 | 1 | 0.71 | 0.25 | 0.06 | 0.56 | 0.78 | 0.91 | 1.00 | ▁▂▃▅▇ |
V29 | 0 | 1 | 0.65 | 0.25 | 0.01 | 0.47 | 0.71 | 0.87 | 1.00 | ▂▂▅▅▇ |
V30 | 0 | 1 | 0.59 | 0.22 | 0.06 | 0.41 | 0.60 | 0.72 | 1.00 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0.50 | 0.23 | 0.10 | 0.33 | 0.44 | 0.65 | 0.97 | ▃▇▅▃▃ |
V32 | 0 | 1 | 0.44 | 0.22 | 0.09 | 0.28 | 0.41 | 0.58 | 0.93 | ▆▇▆▃▂ |
V33 | 0 | 1 | 0.41 | 0.21 | 0.05 | 0.24 | 0.39 | 0.54 | 1.00 | ▆▇▇▃▁ |
V34 | 0 | 1 | 0.39 | 0.22 | 0.06 | 0.22 | 0.36 | 0.54 | 0.95 | ▇▇▃▅▂ |
V35 | 0 | 1 | 0.38 | 0.26 | 0.02 | 0.17 | 0.29 | 0.58 | 0.95 | ▇▆▃▃▂ |
V36 | 0 | 1 | 0.36 | 0.28 | 0.01 | 0.14 | 0.28 | 0.53 | 1.00 | ▇▅▂▂▂ |
V37 | 0 | 1 | 0.35 | 0.26 | 0.04 | 0.14 | 0.26 | 0.49 | 0.91 | ▇▃▂▁▃ |
V38 | 0 | 1 | 0.35 | 0.22 | 0.06 | 0.17 | 0.32 | 0.44 | 0.95 | ▇▆▂▂▁ |
V39 | 0 | 1 | 0.35 | 0.20 | 0.04 | 0.18 | 0.31 | 0.48 | 0.97 | ▇▇▅▂▁ |
V40 | 0 | 1 | 0.32 | 0.18 | 0.02 | 0.20 | 0.28 | 0.43 | 0.93 | ▅▇▃▁▁ |
V41 | 0 | 1 | 0.30 | 0.17 | 0.04 | 0.17 | 0.27 | 0.41 | 0.90 | ▇▇▃▂▁ |
V42 | 0 | 1 | 0.31 | 0.18 | 0.04 | 0.17 | 0.28 | 0.40 | 0.82 | ▇▇▆▁▁ |
V43 | 0 | 1 | 0.27 | 0.15 | 0.03 | 0.16 | 0.26 | 0.35 | 0.75 | ▆▇▅▂▁ |
V44 | 0 | 1 | 0.24 | 0.14 | 0.03 | 0.14 | 0.19 | 0.31 | 0.58 | ▅▇▂▂▂ |
V45 | 0 | 1 | 0.24 | 0.17 | 0.04 | 0.11 | 0.17 | 0.36 | 0.70 | ▇▃▂▂▁ |
V46 | 0 | 1 | 0.19 | 0.16 | 0.01 | 0.08 | 0.14 | 0.23 | 0.73 | ▇▅▂▁▁ |
V47 | 0 | 1 | 0.14 | 0.10 | 0.02 | 0.08 | 0.11 | 0.18 | 0.55 | ▇▃▁▁▁ |
V48 | 0 | 1 | 0.11 | 0.07 | 0.01 | 0.05 | 0.09 | 0.14 | 0.33 | ▆▇▂▁▁ |
V49 | 0 | 1 | 0.06 | 0.04 | 0.01 | 0.03 | 0.05 | 0.09 | 0.20 | ▇▇▃▂▁ |
V50 | 0 | 1 | 0.02 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.08 | ▇▅▂▁▁ |
V51 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.02 | 0.02 | 0.10 | ▇▃▁▁▁ |
V52 | 0 | 1 | 0.02 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.07 | ▇▃▁▁▁ |
V53 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.02 | 0.04 | ▇▃▃▁▁ |
V54 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.02 | 0.04 | ▆▇▂▂▁ |
V55 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▃▁▁▁ |
V56 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
V57 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▅▁▁▁ |
V58 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.04 | ▇▂▁▁▁ |
V59 | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.03 | ▇▇▂▁▁ |
V60 | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.02 | ▇▆▂▁▁ |
##################################
# Identifying columns with low variance
###################################
DPA_LowVariance <- nearZeroVar(DPA,
                               freqCut = 95/5,
                               uniqueCut = 10,
                               saveMetrics = TRUE)

(DPA_LowVariance[DPA_LowVariance$nzv,])
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
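##################################
# Illustrating the nearZeroVar() metrics
##################################
# Hedged toy example, not part of the original output: freqRatio plays the
# same role as First.Second.Mode.Ratio and percentUnique the same role as
# Unique.Count.Ratio, so a nearly constant column trips both cutoffs.
NZVDemo <- data.frame(Constantish = c(rep(0, 98), 1, 2),  # freqRatio = 98/1
                      Informative = rnorm(100))           # all values unique
nearZeroVar(NZVDemo, freqCut = 95/5, uniqueCut = 10, saveMetrics = TRUE)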
if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))==0){

  print("No low variance predictors noted.")

} else {

  print(paste0("Low variance observed for ",
               nrow(DPA_LowVariance[DPA_LowVariance$nzv,]),
               " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))

  DPA_LowVarianceForRemoval <- (nrow(DPA_LowVariance[DPA_LowVariance$nzv,]))

  print(paste0("Low variance can be resolved by removing ",
               nrow(DPA_LowVariance[DPA_LowVariance$nzv,]),
               " numeric variable(s)."))

  for (j in 1:DPA_LowVarianceForRemoval) {
    DPA_LowVarianceRemovedVariable <- rownames(DPA_LowVariance[DPA_LowVariance$nzv,])[j]
    print(paste0("Variable ",
                 j," for removal: ",
                 DPA_LowVarianceRemovedVariable))
  }

  DPA %>%
    skim() %>%
    dplyr::filter(skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,]))

  ##################################
  # Filtering out columns with low variance
  ##################################
  DPA_ExcludedLowVariance <- DPA[,!names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv,])]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))

  ###################################
  # Verifying the data dimensions
  ###################################
  dim(DPA_ExcludedLowVariance)

}
## [1] "No low variance predictors noted."
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Visualizing pairwise correlation between predictors
##################################
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
                                 method = "pearson",
                                 conf.level = .95)
corrplot(cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs"),
method = "circle",
type = "upper",
order = "original",
tl.col = "black",
tl.cex = 0.75,
tl.srt = 90,
sig.level = 0.05,
p.mat = DPA_CorrelationTest$p,
insig = "blank")
##################################
# Identifying the highly correlated variables
##################################
DPA_Correlation <- cor(DPA.Predictors.Numeric,
                       method = "pearson",
                       use="pairwise.complete.obs")

(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
if (DPA_HighlyCorrelatedCount == 0) {
  print("No highly correlated predictors noted.")
} else {
  print(paste0("High correlation observed for ",
               (DPA_HighlyCorrelatedCount)," pairs of numeric variable(s) with Correlation.Coefficient>0.95."))
  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           max_pvalue = 0.05,
                                           top = DPA_HighlyCorrelatedCount,
                                           rm.na = TRUE,
                                           grid = FALSE))
}
## [1] "No highly correlated predictors noted."
if (DPA_HighlyCorrelatedCount > 0) {
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)

  (DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))

  print(paste0("High correlation can be resolved by removing ",
               (DPA_HighlyCorrelatedForRemoval),
               " numeric variable(s)."))

  for (j in 1:DPA_HighlyCorrelatedForRemoval) {
    DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }

  ##################################
  # Filtering out columns with high correlation
  ##################################
  DPA_ExcludedHighCorrelation <- DPA[,-DPA_HighlyCorrelated]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))

  ##################################
  # Verifying the data dimensions
  ##################################
  dim(DPA_ExcludedHighCorrelation)
}
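##################################
# Illustrative sketch (not part of the original pipeline):
# findCorrelation() on a toy frame with one near-duplicate column.
# Object names are hypothetical.
##################################
set.seed(12345678)
x1 <- rnorm(100)
toy <- data.frame(x1 = x1,
                  x2 = x1 + rnorm(100, sd = 0.01),  # near-copy of x1
                  x3 = rnorm(100))
# findCorrelation() flags one member of each pair whose absolute
# correlation exceeds the cutoff, preferring the column with the
# larger mean absolute correlation against all other columns.
findCorrelation(cor(toy), cutoff = 0.95, names = TRUE)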
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               (DPA_LinearlyDependentCount),
               " subset(s) of numeric variable(s)."))

  for (i in 1:DPA_LinearlyDependentCount) {
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    print(paste0("Linear dependent variable(s) for subset ",
                 i,
                 " include: ",
                 DPA_LinearlyDependentSubset))
  }
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
if (DPA_LinearlyDependentCount > 0) {
  DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)

  print(paste0("Linear dependency can be resolved by removing ",
               (DPA_LinearlyDependentForRemoval),
               " numeric variable(s)."))

  for (j in 1:DPA_LinearlyDependentForRemoval) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }

  ##################################
  # Filtering out columns with linear dependency
  ##################################
  DPA_ExcludedLinearlyDependent <- DPA[,-DPA_LinearlyDependent$remove]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))

  ##################################
  # Verifying the data dimensions
  ##################################
  dim(DPA_ExcludedLinearlyDependent)
} else {
  ##################################
  # Verifying the data dimensions
  ##################################
  dim(DPA)
}
## [1] 96 61
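##################################
# Illustrative sketch (not part of the original pipeline):
# findLinearCombos() on a toy frame with an exact linear dependency.
# Object names are hypothetical.
##################################
toy <- data.frame(a = 1:10, b = (1:10)^2)
toy$c <- toy$a + toy$b   # c is an exact linear combination of a and b
findLinearCombos(toy)
# $linearCombos lists each dependent subset (here columns 3, 1, 2);
# $remove recommends dropping column 3 to restore full column rank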
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Applying a Box-Cox transformation
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_BoxCoxTransformedSkimmed <- skim(DPA_BoxCoxTransformed))
Name | DPA_BoxCoxTransformed |
Number of rows | 96 |
Number of columns | 60 |
_______________________ | |
Column type frequency: | |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | -2.61 | 0.38 | -3.64 | -2.88 | -2.64 | -2.38 | -1.64 | ▁▅▇▅▁ |
V2 | 0 | 1 | -2.11 | 0.31 | -2.97 | -2.32 | -2.13 | -1.91 | -1.42 | ▁▃▇▆▂ |
V3 | 0 | 1 | -2.07 | 0.28 | -2.86 | -2.24 | -2.08 | -1.89 | -1.39 | ▁▃▇▅▂ |
V4 | 0 | 1 | -1.98 | 0.27 | -2.62 | -2.17 | -1.98 | -1.82 | -1.39 | ▁▆▇▅▂ |
V5 | 0 | 1 | -1.62 | 0.22 | -2.16 | -1.77 | -1.62 | -1.47 | -1.07 | ▂▆▇▆▁ |
V6 | 0 | 1 | -1.34 | 0.18 | -1.80 | -1.44 | -1.33 | -1.23 | -0.76 | ▂▅▇▂▁ |
V7 | 0 | 1 | -1.18 | 0.14 | -1.52 | -1.26 | -1.18 | -1.09 | -0.74 | ▂▆▇▃▁ |
V8 | 0 | 1 | -1.37 | 0.27 | -2.07 | -1.52 | -1.38 | -1.21 | -0.67 | ▁▃▇▃▁ |
V9 | 0 | 1 | -1.23 | 0.31 | -2.15 | -1.40 | -1.24 | -1.04 | -0.35 | ▁▃▇▃▁ |
V10 | 0 | 1 | -1.07 | 0.26 | -1.79 | -1.25 | -1.08 | -0.92 | -0.46 | ▁▃▇▅▂ |
V11 | 0 | 1 | -1.00 | 0.24 | -1.54 | -1.12 | -1.00 | -0.85 | -0.37 | ▂▃▇▃▁ |
V12 | 0 | 1 | -0.86 | 0.19 | -1.32 | -0.99 | -0.85 | -0.74 | -0.47 | ▂▅▇▇▃ |
V13 | 0 | 1 | -0.93 | 0.25 | -1.50 | -1.08 | -0.92 | -0.78 | -0.31 | ▂▅▇▅▂ |
V14 | 0 | 1 | -1.05 | 0.39 | -2.20 | -1.32 | -1.03 | -0.79 | 0.00 | ▁▃▇▆▁ |
V15 | 0 | 1 | -1.08 | 0.48 | -2.52 | -1.38 | -1.07 | -0.73 | -0.09 | ▁▃▇▇▃ |
V16 | 0 | 1 | -1.01 | 0.51 | -2.35 | -1.41 | -1.02 | -0.59 | -0.03 | ▁▆▇▇▅ |
V17 | 0 | 1 | -0.94 | 0.53 | -2.42 | -1.34 | -1.03 | -0.42 | 0.00 | ▁▅▇▅▇ |
V18 | 0 | 1 | -0.76 | 0.42 | -1.83 | -1.08 | -0.81 | -0.37 | -0.07 | ▁▅▇▅▆ |
V19 | 0 | 1 | -0.64 | 0.39 | -1.39 | -0.96 | -0.69 | -0.29 | -0.02 | ▃▇▅▇▇ |
V20 | 0 | 1 | 0.59 | 0.26 | 0.07 | 0.40 | 0.62 | 0.80 | 1.00 | ▅▆▅▇▆ |
V21 | 0 | 1 | -0.33 | 0.23 | -0.81 | -0.52 | -0.30 | -0.15 | 0.00 | ▃▃▅▇▇ |
V22 | 0 | 1 | -0.34 | 0.24 | -0.82 | -0.56 | -0.29 | -0.15 | 0.00 | ▃▅▅▇▇ |
V23 | 0 | 1 | -0.30 | 0.20 | -0.70 | -0.48 | -0.27 | -0.13 | 0.00 | ▃▅▃▇▇ |
V24 | 0 | 1 | -0.27 | 0.18 | -0.66 | -0.37 | -0.28 | -0.10 | 0.00 | ▅▂▇▆▇ |
V25 | 0 | 1 | -0.27 | 0.18 | -0.66 | -0.38 | -0.26 | -0.12 | 0.00 | ▃▃▇▇▇ |
V26 | 0 | 1 | -0.24 | 0.16 | -0.56 | -0.37 | -0.22 | -0.12 | 0.00 | ▃▅▆▇▇ |
V27 | 0 | 1 | -0.24 | 0.19 | -0.61 | -0.42 | -0.19 | -0.09 | 0.00 | ▃▃▃▃▇ |
V28 | 0 | 1 | -0.24 | 0.19 | -0.62 | -0.38 | -0.21 | -0.09 | 0.00 | ▃▂▅▆▇ |
V29 | 0 | 1 | -0.31 | 0.21 | -0.77 | -0.48 | -0.28 | -0.13 | 0.00 | ▂▆▃▆▇ |
V30 | 0 | 1 | 0.59 | 0.22 | 0.06 | 0.41 | 0.60 | 0.72 | 1.00 | ▁▆▇▇▅ |
V31 | 0 | 1 | -0.66 | 0.35 | -1.50 | -0.90 | -0.70 | -0.40 | -0.03 | ▁▅▇▆▆ |
V32 | 0 | 1 | -0.76 | 0.37 | -1.56 | -1.00 | -0.75 | -0.49 | -0.07 | ▃▇▇▇▆ |
V33 | 0 | 1 | -0.77 | 0.33 | -1.56 | -1.03 | -0.76 | -0.53 | 0.00 | ▁▇▇▇▂ |
V34 | 0 | 1 | -0.85 | 0.40 | -1.70 | -1.14 | -0.83 | -0.54 | -0.05 | ▂▆▇▆▃ |
V35 | 0 | 1 | -0.92 | 0.50 | -1.95 | -1.26 | -0.97 | -0.49 | -0.05 | ▃▇▇▆▇ |
V36 | 0 | 1 | -1.03 | 0.59 | -2.55 | -1.49 | -1.06 | -0.57 | 0.00 | ▁▅▇▆▆ |
V37 | 0 | 1 | -1.34 | 0.81 | -3.35 | -1.93 | -1.35 | -0.72 | -0.09 | ▁▅▇▇▇ |
V38 | 0 | 1 | -1.27 | 0.67 | -2.78 | -1.74 | -1.13 | -0.82 | -0.05 | ▂▅▅▇▅ |
V39 | 0 | 1 | -0.99 | 0.42 | -2.03 | -1.33 | -1.00 | -0.66 | -0.03 | ▁▇▇▆▂ |
V40 | 0 | 1 | -0.92 | 0.32 | -1.70 | -1.11 | -0.94 | -0.69 | -0.07 | ▁▆▇▅▁ |
V41 | 0 | 1 | -1.08 | 0.40 | -2.03 | -1.38 | -1.09 | -0.78 | -0.10 | ▂▆▇▆▂ |
V42 | 0 | 1 | -1.01 | 0.37 | -1.78 | -1.27 | -1.00 | -0.77 | -0.19 | ▃▅▇▅▂ |
V43 | 0 | 1 | -1.00 | 0.29 | -1.65 | -1.20 | -0.98 | -0.82 | -0.27 | ▃▆▇▅▂ |
V44 | 0 | 1 | -1.36 | 0.47 | -2.60 | -1.63 | -1.41 | -1.05 | -0.52 | ▁▃▇▅▅ |
V45 | 0 | 1 | -1.70 | 0.74 | -3.35 | -2.20 | -1.75 | -1.02 | -0.35 | ▂▅▇▃▆ |
V46 | 0 | 1 | -1.57 | 0.56 | -3.10 | -1.95 | -1.60 | -1.28 | -0.31 | ▁▅▇▅▃ |
V47 | 0 | 1 | -2.16 | 0.66 | -4.02 | -2.56 | -2.22 | -1.70 | -0.59 | ▁▅▇▅▂ |
V48 | 0 | 1 | -1.70 | 0.35 | -2.55 | -1.95 | -1.70 | -1.50 | -0.93 | ▂▅▇▅▂ |
V49 | 0 | 1 | -1.93 | 0.29 | -2.57 | -2.13 | -1.94 | -1.71 | -1.28 | ▃▆▇▆▂ |
V50 | 0 | 1 | -3.94 | 0.64 | -5.43 | -4.34 | -3.95 | -3.54 | -2.49 | ▂▆▇▆▃ |
V51 | 0 | 1 | -2.37 | 0.21 | -2.93 | -2.49 | -2.38 | -2.25 | -1.66 | ▂▅▇▂▁ |
V52 | 0 | 1 | -4.40 | 0.66 | -6.65 | -4.74 | -4.41 | -4.00 | -2.65 | ▁▂▇▆▁ |
V53 | 0 | 1 | -2.52 | 0.18 | -2.99 | -2.65 | -2.55 | -2.38 | -2.10 | ▁▆▇▇▂ |
V54 | 0 | 1 | -2.51 | 0.19 | -2.91 | -2.64 | -2.50 | -2.39 | -2.11 | ▂▃▇▃▃ |
V55 | 0 | 1 | -4.95 | 0.78 | -6.81 | -5.60 | -4.87 | -4.37 | -3.11 | ▂▇▇▇▁ |
V56 | 0 | 1 | -2.58 | 0.17 | -3.01 | -2.68 | -2.58 | -2.46 | -2.07 | ▁▆▇▃▁ |
V57 | 0 | 1 | -5.11 | 0.71 | -6.81 | -5.60 | -5.16 | -4.54 | -3.34 | ▂▇▇▇▂ |
V58 | 0 | 1 | -5.07 | 0.79 | -7.01 | -5.63 | -5.07 | -4.58 | -3.12 | ▁▅▇▅▂ |
V59 | 0 | 1 | -2.59 | 0.18 | -3.12 | -2.72 | -2.58 | -2.48 | -2.18 | ▁▃▇▅▂ |
V60 | 0 | 1 | -3.27 | 0.25 | -3.87 | -3.43 | -3.26 | -3.09 | -2.67 | ▂▆▇▆▂ |
###################################
# Verifying the data dimensions
###################################
dim(DPA_BoxCoxTransformed)
## [1] 96 60
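##################################
# Illustrative sketch (not part of the original pipeline):
# Box-Cox lambda estimation on a single skewed vector via
# caret::BoxCoxTrans(). The transformation is (x^lambda - 1)/lambda,
# with lambda = 0 reducing to log(x); it requires strictly positive
# values. Object names are hypothetical.
##################################
set.seed(12345678)
skewed <- exp(rnorm(200))        # log-normal, strongly right-skewed
bc <- BoxCoxTrans(skewed)
bc$lambda                        # estimated lambda, expected near 0
head(predict(bc, skewed))        # approximately log(skewed)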
##################################
# Loading dataset
##################################
DPA <- Sonar_Train
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[,!names(DPA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[,sapply(DPA.Predictors, is.numeric)]
##################################
# Applying a Box-Cox transformation
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Applying a center and scale data transformation
##################################
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_BoxCoxTransformed, method = c("center","scale"))
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_BoxCoxTransformed)
##################################
# Gathering descriptive statistics
##################################
(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformedSkimmed <- skim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed))
Name | DPA.Predictors.Numeric_Bo… |
Number of rows | 96 |
Number of columns | 60 |
_______________________ | |
Column type frequency: | |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | 0 | 1 | -2.69 | -0.72 | -0.09 | 0.61 | 2.54 | ▁▅▇▅▁ |
V2 | 0 | 1 | 0 | 1 | -2.81 | -0.68 | -0.05 | 0.66 | 2.26 | ▁▃▇▆▂ |
V3 | 0 | 1 | 0 | 1 | -2.79 | -0.60 | -0.02 | 0.65 | 2.41 | ▁▃▇▅▂ |
V4 | 0 | 1 | 0 | 1 | -2.40 | -0.71 | 0.00 | 0.63 | 2.23 | ▁▆▇▅▂ |
V5 | 0 | 1 | 0 | 1 | -2.44 | -0.69 | 0.02 | 0.67 | 2.49 | ▂▆▇▆▁ |
V6 | 0 | 1 | 0 | 1 | -2.57 | -0.58 | 0.06 | 0.62 | 3.20 | ▂▅▇▂▁ |
V7 | 0 | 1 | 0 | 1 | -2.33 | -0.57 | 0.04 | 0.62 | 3.08 | ▂▆▇▃▁ |
V8 | 0 | 1 | 0 | 1 | -2.54 | -0.55 | -0.01 | 0.59 | 2.56 | ▁▃▇▃▁ |
V9 | 0 | 1 | 0 | 1 | -2.90 | -0.52 | -0.01 | 0.60 | 2.79 | ▁▃▇▃▁ |
V10 | 0 | 1 | 0 | 1 | -2.73 | -0.67 | -0.03 | 0.59 | 2.34 | ▁▃▇▅▂ |
V11 | 0 | 1 | 0 | 1 | -2.29 | -0.53 | -0.01 | 0.63 | 2.64 | ▂▃▇▃▁ |
V12 | 0 | 1 | 0 | 1 | -2.42 | -0.70 | 0.06 | 0.59 | 2.02 | ▂▅▇▇▃ |
V13 | 0 | 1 | 0 | 1 | -2.30 | -0.60 | 0.05 | 0.60 | 2.47 | ▂▅▇▅▂ |
V14 | 0 | 1 | 0 | 1 | -2.92 | -0.68 | 0.05 | 0.67 | 2.67 | ▁▃▇▆▁ |
V15 | 0 | 1 | 0 | 1 | -3.02 | -0.65 | 0.01 | 0.73 | 2.07 | ▁▃▇▇▃ |
V16 | 0 | 1 | 0 | 1 | -2.63 | -0.78 | -0.01 | 0.83 | 1.95 | ▁▆▇▇▅ |
V17 | 0 | 1 | 0 | 1 | -2.82 | -0.77 | -0.17 | 0.99 | 1.78 | ▁▅▇▅▇ |
V18 | 0 | 1 | 0 | 1 | -2.53 | -0.76 | -0.13 | 0.94 | 1.64 | ▁▅▇▅▆ |
V19 | 0 | 1 | 0 | 1 | -1.93 | -0.82 | -0.12 | 0.90 | 1.62 | ▃▇▅▇▇ |
V20 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.14 | 0.82 | 1.60 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0 | 1 | -2.03 | -0.80 | 0.17 | 0.78 | 1.43 | ▃▃▅▇▇ |
V22 | 0 | 1 | 0 | 1 | -2.06 | -0.93 | 0.21 | 0.80 | 1.42 | ▃▅▅▇▇ |
V23 | 0 | 1 | 0 | 1 | -1.95 | -0.86 | 0.18 | 0.87 | 1.50 | ▃▅▃▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.09 | -0.53 | -0.02 | 0.94 | 1.49 | ▅▂▇▆▇ |
V25 | 0 | 1 | 0 | 1 | -2.13 | -0.59 | 0.08 | 0.82 | 1.48 | ▃▃▇▇▇ |
V26 | 0 | 1 | 0 | 1 | -1.99 | -0.77 | 0.12 | 0.77 | 1.51 | ▃▅▆▇▇ |
V27 | 0 | 1 | 0 | 1 | -1.94 | -0.95 | 0.26 | 0.83 | 1.29 | ▃▃▃▃▇ |
V28 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.19 | 0.84 | 1.30 | ▃▂▅▆▇ |
V29 | 0 | 1 | 0 | 1 | -2.16 | -0.81 | 0.18 | 0.88 | 1.50 | ▂▆▃▆▇ |
V30 | 0 | 1 | 0 | 1 | -2.42 | -0.80 | 0.07 | 0.61 | 1.90 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.11 | 0.73 | 1.77 | ▁▅▇▆▆ |
V32 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | 0.02 | 0.74 | 1.87 | ▃▇▇▇▆ |
V33 | 0 | 1 | 0 | 1 | -2.40 | -0.79 | 0.03 | 0.71 | 2.31 | ▁▇▇▇▂ |
V34 | 0 | 1 | 0 | 1 | -2.14 | -0.75 | 0.04 | 0.76 | 2.02 | ▂▆▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -2.08 | -0.68 | -0.11 | 0.85 | 1.74 | ▃▇▇▆▇ |
V36 | 0 | 1 | 0 | 1 | -2.57 | -0.78 | -0.04 | 0.78 | 1.75 | ▁▅▇▆▆ |
V37 | 0 | 1 | 0 | 1 | -2.48 | -0.73 | -0.01 | 0.77 | 1.54 | ▁▅▇▇▇ |
V38 | 0 | 1 | 0 | 1 | -2.25 | -0.70 | 0.22 | 0.67 | 1.81 | ▂▅▅▇▅ |
V39 | 0 | 1 | 0 | 1 | -2.48 | -0.81 | -0.01 | 0.79 | 2.29 | ▁▇▇▆▂ |
V40 | 0 | 1 | 0 | 1 | -2.46 | -0.62 | -0.07 | 0.71 | 2.67 | ▁▆▇▅▁ |
V41 | 0 | 1 | 0 | 1 | -2.35 | -0.73 | -0.03 | 0.74 | 2.42 | ▂▆▇▆▂ |
V42 | 0 | 1 | 0 | 1 | -2.09 | -0.71 | 0.04 | 0.65 | 2.23 | ▃▅▇▅▂ |
V43 | 0 | 1 | 0 | 1 | -2.23 | -0.68 | 0.05 | 0.60 | 2.51 | ▃▆▇▅▂ |
V44 | 0 | 1 | 0 | 1 | -2.66 | -0.58 | -0.10 | 0.67 | 1.81 | ▁▃▇▅▅ |
V45 | 0 | 1 | 0 | 1 | -2.21 | -0.67 | -0.06 | 0.93 | 1.82 | ▂▅▇▃▆ |
V46 | 0 | 1 | 0 | 1 | -2.72 | -0.67 | -0.05 | 0.53 | 2.27 | ▁▅▇▅▃ |
V47 | 0 | 1 | 0 | 1 | -2.81 | -0.60 | -0.08 | 0.70 | 2.37 | ▁▅▇▅▂ |
V48 | 0 | 1 | 0 | 1 | -2.44 | -0.72 | 0.01 | 0.57 | 2.21 | ▂▅▇▅▂ |
V49 | 0 | 1 | 0 | 1 | -2.21 | -0.69 | -0.02 | 0.79 | 2.26 | ▃▆▇▆▂ |
V50 | 0 | 1 | 0 | 1 | -2.31 | -0.61 | -0.02 | 0.62 | 2.25 | ▂▆▇▆▃ |
V51 | 0 | 1 | 0 | 1 | -2.60 | -0.53 | -0.01 | 0.59 | 3.36 | ▂▅▇▂▁ |
V52 | 0 | 1 | 0 | 1 | -3.42 | -0.52 | -0.02 | 0.60 | 2.65 | ▁▂▇▆▁ |
V53 | 0 | 1 | 0 | 1 | -2.66 | -0.76 | -0.19 | 0.78 | 2.34 | ▁▆▇▇▂ |
V54 | 0 | 1 | 0 | 1 | -2.12 | -0.69 | 0.01 | 0.62 | 2.05 | ▂▃▇▃▃ |
V55 | 0 | 1 | 0 | 1 | -2.39 | -0.83 | 0.11 | 0.75 | 2.37 | ▂▇▇▇▁ |
V56 | 0 | 1 | 0 | 1 | -2.63 | -0.61 | 0.00 | 0.74 | 3.08 | ▁▆▇▃▁ |
V57 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.07 | 0.81 | 2.49 | ▂▇▇▇▂ |
V58 | 0 | 1 | 0 | 1 | -2.44 | -0.70 | 0.01 | 0.62 | 2.45 | ▁▅▇▅▂ |
V59 | 0 | 1 | 0 | 1 | -2.88 | -0.69 | 0.06 | 0.62 | 2.25 | ▁▃▇▅▂ |
V60 | 0 | 1 | 0 | 1 | -2.43 | -0.65 | 0.02 | 0.71 | 2.39 | ▂▆▇▆▂ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed)
## [1] 96 60
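##################################
# Sanity-check sketch (not part of the original pipeline):
# preProcess(method = c("center","scale")) uses the per-column mean
# and standard deviation, so base R's scale() should reproduce it on
# the same Box-Cox output, assuming DPA_BoxCoxTransformed is still in
# memory. The manual_scaled name is hypothetical.
##################################
manual_scaled <- as.data.frame(scale(DPA_BoxCoxTransformed))
all.equal(manual_scaled$V1,
          DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed$V1)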
##################################
# Creating the pre-modelling
# train set
##################################
Class <- DPA$Class
PMA.Predictors.Numeric <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Class, PMA.Predictors.Numeric)
PMA_PreModelling_Train <- PMA_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
(PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train))
Name | PMA_PreModelling_Train |
Number of rows | 96 |
Number of columns | 61 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Class | 0 | 1 | FALSE | 2 | M: 78, R: 18 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | 0 | 1 | -2.69 | -0.72 | -0.09 | 0.61 | 2.54 | ▁▅▇▅▁ |
V2 | 0 | 1 | 0 | 1 | -2.81 | -0.68 | -0.05 | 0.66 | 2.26 | ▁▃▇▆▂ |
V3 | 0 | 1 | 0 | 1 | -2.79 | -0.60 | -0.02 | 0.65 | 2.41 | ▁▃▇▅▂ |
V4 | 0 | 1 | 0 | 1 | -2.40 | -0.71 | 0.00 | 0.63 | 2.23 | ▁▆▇▅▂ |
V5 | 0 | 1 | 0 | 1 | -2.44 | -0.69 | 0.02 | 0.67 | 2.49 | ▂▆▇▆▁ |
V6 | 0 | 1 | 0 | 1 | -2.57 | -0.58 | 0.06 | 0.62 | 3.20 | ▂▅▇▂▁ |
V7 | 0 | 1 | 0 | 1 | -2.33 | -0.57 | 0.04 | 0.62 | 3.08 | ▂▆▇▃▁ |
V8 | 0 | 1 | 0 | 1 | -2.54 | -0.55 | -0.01 | 0.59 | 2.56 | ▁▃▇▃▁ |
V9 | 0 | 1 | 0 | 1 | -2.90 | -0.52 | -0.01 | 0.60 | 2.79 | ▁▃▇▃▁ |
V10 | 0 | 1 | 0 | 1 | -2.73 | -0.67 | -0.03 | 0.59 | 2.34 | ▁▃▇▅▂ |
V11 | 0 | 1 | 0 | 1 | -2.29 | -0.53 | -0.01 | 0.63 | 2.64 | ▂▃▇▃▁ |
V12 | 0 | 1 | 0 | 1 | -2.42 | -0.70 | 0.06 | 0.59 | 2.02 | ▂▅▇▇▃ |
V13 | 0 | 1 | 0 | 1 | -2.30 | -0.60 | 0.05 | 0.60 | 2.47 | ▂▅▇▅▂ |
V14 | 0 | 1 | 0 | 1 | -2.92 | -0.68 | 0.05 | 0.67 | 2.67 | ▁▃▇▆▁ |
V15 | 0 | 1 | 0 | 1 | -3.02 | -0.65 | 0.01 | 0.73 | 2.07 | ▁▃▇▇▃ |
V16 | 0 | 1 | 0 | 1 | -2.63 | -0.78 | -0.01 | 0.83 | 1.95 | ▁▆▇▇▅ |
V17 | 0 | 1 | 0 | 1 | -2.82 | -0.77 | -0.17 | 0.99 | 1.78 | ▁▅▇▅▇ |
V18 | 0 | 1 | 0 | 1 | -2.53 | -0.76 | -0.13 | 0.94 | 1.64 | ▁▅▇▅▆ |
V19 | 0 | 1 | 0 | 1 | -1.93 | -0.82 | -0.12 | 0.90 | 1.62 | ▃▇▅▇▇ |
V20 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.14 | 0.82 | 1.60 | ▅▆▅▇▆ |
V21 | 0 | 1 | 0 | 1 | -2.03 | -0.80 | 0.17 | 0.78 | 1.43 | ▃▃▅▇▇ |
V22 | 0 | 1 | 0 | 1 | -2.06 | -0.93 | 0.21 | 0.80 | 1.42 | ▃▅▅▇▇ |
V23 | 0 | 1 | 0 | 1 | -1.95 | -0.86 | 0.18 | 0.87 | 1.50 | ▃▅▃▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.09 | -0.53 | -0.02 | 0.94 | 1.49 | ▅▂▇▆▇ |
V25 | 0 | 1 | 0 | 1 | -2.13 | -0.59 | 0.08 | 0.82 | 1.48 | ▃▃▇▇▇ |
V26 | 0 | 1 | 0 | 1 | -1.99 | -0.77 | 0.12 | 0.77 | 1.51 | ▃▅▆▇▇ |
V27 | 0 | 1 | 0 | 1 | -1.94 | -0.95 | 0.26 | 0.83 | 1.29 | ▃▃▃▃▇ |
V28 | 0 | 1 | 0 | 1 | -2.01 | -0.73 | 0.19 | 0.84 | 1.30 | ▃▂▅▆▇ |
V29 | 0 | 1 | 0 | 1 | -2.16 | -0.81 | 0.18 | 0.88 | 1.50 | ▂▆▃▆▇ |
V30 | 0 | 1 | 0 | 1 | -2.42 | -0.80 | 0.07 | 0.61 | 1.90 | ▁▆▇▇▅ |
V31 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.11 | 0.73 | 1.77 | ▁▅▇▆▆ |
V32 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | 0.02 | 0.74 | 1.87 | ▃▇▇▇▆ |
V33 | 0 | 1 | 0 | 1 | -2.40 | -0.79 | 0.03 | 0.71 | 2.31 | ▁▇▇▇▂ |
V34 | 0 | 1 | 0 | 1 | -2.14 | -0.75 | 0.04 | 0.76 | 2.02 | ▂▆▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -2.08 | -0.68 | -0.11 | 0.85 | 1.74 | ▃▇▇▆▇ |
V36 | 0 | 1 | 0 | 1 | -2.57 | -0.78 | -0.04 | 0.78 | 1.75 | ▁▅▇▆▆ |
V37 | 0 | 1 | 0 | 1 | -2.48 | -0.73 | -0.01 | 0.77 | 1.54 | ▁▅▇▇▇ |
V38 | 0 | 1 | 0 | 1 | -2.25 | -0.70 | 0.22 | 0.67 | 1.81 | ▂▅▅▇▅ |
V39 | 0 | 1 | 0 | 1 | -2.48 | -0.81 | -0.01 | 0.79 | 2.29 | ▁▇▇▆▂ |
V40 | 0 | 1 | 0 | 1 | -2.46 | -0.62 | -0.07 | 0.71 | 2.67 | ▁▆▇▅▁ |
V41 | 0 | 1 | 0 | 1 | -2.35 | -0.73 | -0.03 | 0.74 | 2.42 | ▂▆▇▆▂ |
V42 | 0 | 1 | 0 | 1 | -2.09 | -0.71 | 0.04 | 0.65 | 2.23 | ▃▅▇▅▂ |
V43 | 0 | 1 | 0 | 1 | -2.23 | -0.68 | 0.05 | 0.60 | 2.51 | ▃▆▇▅▂ |
V44 | 0 | 1 | 0 | 1 | -2.66 | -0.58 | -0.10 | 0.67 | 1.81 | ▁▃▇▅▅ |
V45 | 0 | 1 | 0 | 1 | -2.21 | -0.67 | -0.06 | 0.93 | 1.82 | ▂▅▇▃▆ |
V46 | 0 | 1 | 0 | 1 | -2.72 | -0.67 | -0.05 | 0.53 | 2.27 | ▁▅▇▅▃ |
V47 | 0 | 1 | 0 | 1 | -2.81 | -0.60 | -0.08 | 0.70 | 2.37 | ▁▅▇▅▂ |
V48 | 0 | 1 | 0 | 1 | -2.44 | -0.72 | 0.01 | 0.57 | 2.21 | ▂▅▇▅▂ |
V49 | 0 | 1 | 0 | 1 | -2.21 | -0.69 | -0.02 | 0.79 | 2.26 | ▃▆▇▆▂ |
V50 | 0 | 1 | 0 | 1 | -2.31 | -0.61 | -0.02 | 0.62 | 2.25 | ▂▆▇▆▃ |
V51 | 0 | 1 | 0 | 1 | -2.60 | -0.53 | -0.01 | 0.59 | 3.36 | ▂▅▇▂▁ |
V52 | 0 | 1 | 0 | 1 | -3.42 | -0.52 | -0.02 | 0.60 | 2.65 | ▁▂▇▆▁ |
V53 | 0 | 1 | 0 | 1 | -2.66 | -0.76 | -0.19 | 0.78 | 2.34 | ▁▆▇▇▂ |
V54 | 0 | 1 | 0 | 1 | -2.12 | -0.69 | 0.01 | 0.62 | 2.05 | ▂▃▇▃▃ |
V55 | 0 | 1 | 0 | 1 | -2.39 | -0.83 | 0.11 | 0.75 | 2.37 | ▂▇▇▇▁ |
V56 | 0 | 1 | 0 | 1 | -2.63 | -0.61 | 0.00 | 0.74 | 3.08 | ▁▆▇▃▁ |
V57 | 0 | 1 | 0 | 1 | -2.40 | -0.69 | -0.07 | 0.81 | 2.49 | ▂▇▇▇▂ |
V58 | 0 | 1 | 0 | 1 | -2.44 | -0.70 | 0.01 | 0.62 | 2.45 | ▁▅▇▅▂ |
V59 | 0 | 1 | 0 | 1 | -2.88 | -0.69 | 0.06 | 0.62 | 2.25 | ▁▃▇▅▂ |
V60 | 0 | 1 | 0 | 1 | -2.43 | -0.65 | 0.02 | 0.71 | 2.39 | ▂▆▇▆▂ |
###################################
# Verifying the data dimensions
# for the train set
###################################
dim(PMA_PreModelling_Train)
## [1] 96 61
##################################
# Formulating the test set
##################################
DPA_Test <- Sonar_Test
DPA_Test.Predictors <- DPA_Test[,!names(DPA_Test) %in% c("Class")]
DPA_Test.Predictors.Numeric <- DPA_Test.Predictors[,sapply(DPA_Test.Predictors, is.numeric)]
DPA_Test_BoxCox <- preProcess(DPA_Test.Predictors.Numeric, method = c("BoxCox"))
DPA_Test_BoxCoxTransformed <- predict(DPA_Test_BoxCox, DPA_Test.Predictors.Numeric)
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_Test_BoxCoxTransformed, method = c("center","scale"))
DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_Test_BoxCoxTransformed)
##################################
# Creating the pre-modelling
# test set
##################################
Class <- DPA_Test$Class
PMA_Test.Predictors.Numeric <- DPA_Test.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
PMA_Test_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Class, PMA_Test.Predictors.Numeric)
PMA_PreModelling_Test <- PMA_Test_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
(PMA_PreModelling_Test_Skimmed <- skim(PMA_PreModelling_Test))
Name | PMA_PreModelling_Test |
Number of rows | 40 |
Number of columns | 61 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 60 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Class | 0 | 1 | FALSE | 2 | M: 33, R: 7 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
V1 | 0 | 1 | 0 | 1 | -2.49 | -0.67 | -0.04 | 0.53 | 2.40 | ▁▃▇▃▁ |
V2 | 0 | 1 | 0 | 1 | -2.15 | -0.72 | 0.11 | 0.59 | 2.77 | ▂▆▇▃▁ |
V3 | 0 | 1 | 0 | 1 | -2.02 | -0.59 | 0.17 | 0.62 | 2.39 | ▃▃▇▃▁ |
V4 | 0 | 1 | 0 | 1 | -2.20 | -0.74 | -0.11 | 0.69 | 2.55 | ▂▇▇▅▁ |
V5 | 0 | 1 | 0 | 1 | -2.24 | -0.66 | 0.06 | 0.46 | 2.40 | ▂▅▇▃▂ |
V6 | 0 | 1 | 0 | 1 | -2.15 | -0.65 | -0.03 | 0.67 | 1.87 | ▂▅▇▆▅ |
V7 | 0 | 1 | 0 | 1 | -2.66 | -0.61 | -0.13 | 0.61 | 2.59 | ▁▅▇▅▁ |
V8 | 0 | 1 | 0 | 1 | -2.93 | -0.43 | -0.08 | 0.45 | 3.01 | ▁▂▇▂▁ |
V9 | 0 | 1 | 0 | 1 | -1.95 | -0.73 | -0.09 | 0.64 | 2.29 | ▃▆▇▅▂ |
V10 | 0 | 1 | 0 | 1 | -2.23 | -0.50 | -0.04 | 0.47 | 2.23 | ▂▃▇▃▁ |
V11 | 0 | 1 | 0 | 1 | -2.20 | -0.62 | 0.06 | 0.47 | 2.26 | ▂▃▇▂▂ |
V12 | 0 | 1 | 0 | 1 | -2.47 | -0.39 | -0.10 | 0.33 | 2.13 | ▁▂▇▃▂ |
V13 | 0 | 1 | 0 | 1 | -2.21 | -0.57 | -0.01 | 0.48 | 2.17 | ▂▃▇▃▂ |
V14 | 0 | 1 | 0 | 1 | -2.48 | -0.66 | 0.00 | 0.37 | 2.67 | ▁▃▇▃▁ |
V15 | 0 | 1 | 0 | 1 | -2.49 | -0.71 | 0.01 | 0.64 | 2.04 | ▁▅▇▅▃ |
V16 | 0 | 1 | 0 | 1 | -2.52 | -0.85 | -0.12 | 0.79 | 1.84 | ▁▆▆▇▃ |
V17 | 0 | 1 | 0 | 1 | -1.87 | -0.81 | -0.16 | 0.94 | 1.63 | ▂▇▅▃▇ |
V18 | 0 | 1 | 0 | 1 | -2.20 | -0.76 | -0.20 | 0.86 | 1.59 | ▁▆▇▃▇ |
V19 | 0 | 1 | 0 | 1 | -1.82 | -0.69 | 0.00 | 1.00 | 1.52 | ▃▇▆▂▇ |
V20 | 0 | 1 | 0 | 1 | -1.85 | -0.85 | 0.09 | 0.98 | 1.43 | ▃▃▅▅▇ |
V21 | 0 | 1 | 0 | 1 | -1.83 | -0.91 | 0.22 | 0.68 | 1.56 | ▅▂▅▇▆ |
V22 | 0 | 1 | 0 | 1 | -1.98 | -0.90 | 0.20 | 0.70 | 1.49 | ▃▃▃▇▅ |
V23 | 0 | 1 | 0 | 1 | -2.05 | -0.68 | 0.22 | 0.67 | 1.42 | ▃▃▅▇▇ |
V24 | 0 | 1 | 0 | 1 | -2.24 | -0.71 | 0.11 | 0.75 | 1.45 | ▂▅▆▇▇ |
V25 | 0 | 1 | 0 | 1 | -1.79 | -0.83 | 0.00 | 0.92 | 1.41 | ▃▂▆▃▇ |
V26 | 0 | 1 | 0 | 1 | -1.98 | -0.80 | 0.34 | 0.96 | 1.15 | ▂▃▂▂▇ |
V27 | 0 | 1 | 0 | 1 | -1.91 | -0.86 | 0.22 | 0.97 | 1.10 | ▂▃▂▂▇ |
V28 | 0 | 1 | 0 | 1 | -2.12 | -0.74 | 0.14 | 1.00 | 1.23 | ▂▃▅▃▇ |
V29 | 0 | 1 | 0 | 1 | -2.01 | -0.81 | 0.05 | 0.90 | 1.69 | ▃▇▅▇▆ |
V30 | 0 | 1 | 0 | 1 | -1.85 | -0.71 | 0.29 | 0.77 | 1.84 | ▃▇▃▇▃ |
V31 | 0 | 1 | 0 | 1 | -2.08 | -0.57 | -0.13 | 0.65 | 2.11 | ▂▅▇▃▃ |
V32 | 0 | 1 | 0 | 1 | -2.19 | -0.41 | 0.12 | 0.46 | 2.19 | ▂▂▇▂▂ |
V33 | 0 | 1 | 0 | 1 | -2.15 | -0.59 | 0.08 | 0.73 | 1.96 | ▂▃▇▆▂ |
V34 | 0 | 1 | 0 | 1 | -2.44 | -0.68 | -0.07 | 0.67 | 2.14 | ▁▅▇▆▃ |
V35 | 0 | 1 | 0 | 1 | -1.85 | -0.72 | 0.07 | 0.77 | 1.80 | ▅▅▅▇▃ |
V36 | 0 | 1 | 0 | 1 | -2.08 | -0.63 | -0.11 | 0.76 | 1.98 | ▃▃▇▇▂ |
V37 | 0 | 1 | 0 | 1 | -2.28 | -0.61 | 0.05 | 0.80 | 1.99 | ▂▃▇▆▂ |
V38 | 0 | 1 | 0 | 1 | -2.53 | -0.79 | 0.14 | 0.66 | 2.44 | ▁▇▇▇▂ |
V39 | 0 | 1 | 0 | 1 | -1.77 | -0.71 | 0.27 | 0.61 | 2.08 | ▆▅▇▇▂ |
V40 | 0 | 1 | 0 | 1 | -2.18 | -0.84 | 0.07 | 0.84 | 1.89 | ▁▇▇▇▂ |
V41 | 0 | 1 | 0 | 1 | -1.77 | -0.81 | 0.12 | 0.74 | 1.76 | ▆▃▅▇▃ |
V42 | 0 | 1 | 0 | 1 | -2.08 | -0.74 | -0.17 | 1.01 | 1.94 | ▂▇▇▅▃ |
V43 | 0 | 1 | 0 | 1 | -2.14 | -0.69 | -0.15 | 0.99 | 1.59 | ▂▅▇▂▇ |
V44 | 0 | 1 | 0 | 1 | -2.40 | -0.56 | -0.04 | 0.61 | 2.01 | ▁▃▇▅▃ |
V45 | 0 | 1 | 0 | 1 | -2.22 | -0.70 | -0.04 | 0.43 | 1.92 | ▁▇▇▅▅ |
V46 | 0 | 1 | 0 | 1 | -2.34 | -0.68 | -0.03 | 0.66 | 2.13 | ▁▇▇▇▃ |
V47 | 0 | 1 | 0 | 1 | -2.06 | -0.61 | 0.07 | 0.66 | 2.28 | ▂▆▇▆▁ |
V48 | 0 | 1 | 0 | 1 | -2.05 | -0.70 | -0.05 | 0.70 | 2.07 | ▃▆▇▆▂ |
V49 | 0 | 1 | 0 | 1 | -1.83 | -0.75 | 0.05 | 0.78 | 1.77 | ▅▅▅▇▃ |
V50 | 0 | 1 | 0 | 1 | -1.81 | -0.60 | 0.20 | 0.57 | 2.59 | ▃▃▇▂▁ |
V51 | 0 | 1 | 0 | 1 | -2.15 | -0.57 | 0.02 | 0.57 | 2.09 | ▂▃▇▃▂ |
V52 | 0 | 1 | 0 | 1 | -2.12 | -0.61 | 0.01 | 0.50 | 2.17 | ▃▆▇▅▃ |
V53 | 0 | 1 | 0 | 1 | -2.04 | -0.36 | 0.05 | 0.51 | 2.65 | ▃▆▇▃▁ |
V54 | 0 | 1 | 0 | 1 | -2.06 | -0.68 | -0.15 | 0.72 | 1.92 | ▃▆▇▇▅ |
V55 | 0 | 1 | 0 | 1 | -1.93 | -0.71 | -0.17 | 0.77 | 1.84 | ▃▆▇▇▅ |
V56 | 0 | 1 | 0 | 1 | -2.19 | -0.53 | 0.06 | 0.62 | 1.91 | ▂▃▇▅▃ |
V57 | 0 | 1 | 0 | 1 | -2.34 | -0.64 | 0.04 | 0.60 | 1.86 | ▂▃▇▇▅ |
V58 | 0 | 1 | 0 | 1 | -2.16 | -0.60 | -0.08 | 0.77 | 2.49 | ▃▇▇▇▁ |
V59 | 0 | 1 | 0 | 1 | -2.56 | -0.74 | -0.21 | 0.70 | 2.43 | ▁▅▇▆▁ |
V60 | 0 | 1 | 0 | 1 | -1.76 | -0.83 | -0.06 | 0.69 | 2.55 | ▅▇▇▅▁ |
###################################
# Verifying the data dimensions
# for the test set
###################################
dim(PMA_PreModelling_Test)
## [1] 40 61
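##################################
# Note (sketch, not part of the original pipeline):
# Above, the Box-Cox lambdas, centers and scales for the test set were
# re-estimated from the test data themselves. A commonly preferred
# alternative applies the preProcess objects fitted on the train set,
# so both partitions share identical transformation parameters. The
# *_Alt names below are hypothetical.
##################################
DPA_Test_BoxCoxTransformed_Alt <- predict(DPA_BoxCox,
                                          DPA_Test.Predictors.Numeric)
PMA_Test_Predictors_Alt <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled,
                                   DPA_Test_BoxCoxTransformed_Alt)
dim(PMA_Test_Predictors_Alt)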
##################################
# Loading dataset
##################################
EDA <- PMA_PreModelling_Train
##################################
# Listing all predictors
##################################
EDA.Predictors <- EDA[,!names(EDA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
EDA.Predictors.Numeric <- EDA.Predictors[,sapply(EDA.Predictors, is.numeric)]
ncol(EDA.Predictors.Numeric)
## [1] 60
names(EDA.Predictors.Numeric)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11" "V12"
## [13] "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22" "V23" "V24"
## [25] "V25" "V26" "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35" "V36"
## [37] "V37" "V38" "V39" "V40" "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48"
## [49] "V49" "V50" "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60"
##################################
# Formulating the box plots
##################################
featurePlot(x = EDA.Predictors.Numeric,
y = EDA$Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|")
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train$Class)
##
## M R
## 78 18
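##################################
# Sketch (not part of the original pipeline):
# quantifying the class imbalance
##################################
prop.table(table(PMA_PreModelling_Train$Class))
# M = 0.8125, R = 0.1875: roughly a 4:1 majority-to-minority ratio,
# which motivates the resampling strategies explored below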
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
<- trainControl(method = "repeatedcv",
RepeatedCV_Control repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
set.seed(12345678)
BTREE_REF <- train(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
                   y = PMA_PreModelling_Train$Class,
                   method = "treebag",
                   nbagg = 50,
                   metric = "ROC",
                   trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_REF
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7815179 0.9571429 0.33
BTREE_REF$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_REF$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.7815179 0.9571429 0.33 0.2135526 0.08822253 0.3441938
(BTREE_REF_Train_ROCCurveAUC <- BTREE_REF$results$ROC)
## [1] 0.7815179
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_REF_VarImp <- varImp(BTREE_REF, scale = TRUE)
plot(BTREE_REF_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (REF)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_REF_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                             BTREE_Predicted = predict(BTREE_REF,
                                                       PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                       type = "prob"))
BTREE_REF_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.60 0.40
## 2 M 0.88 0.12
## 3 M 0.70 0.30
## 4 M 0.86 0.14
## 5 M 0.64 0.36
## 6 M 0.80 0.20
## 7 M 0.74 0.26
## 8 M 0.62 0.38
## 9 M 0.94 0.06
## 10 M 0.90 0.10
## 11 M 0.76 0.24
## 12 M 0.96 0.04
## 13 M 0.74 0.26
## 14 M 0.76 0.24
## 15 M 0.74 0.26
## 16 M 0.88 0.12
## 17 M 0.88 0.12
## 18 M 0.54 0.46
## 19 M 0.96 0.04
## 20 M 0.86 0.14
## 21 M 0.78 0.22
## 22 M 0.82 0.18
## 23 M 0.84 0.16
## 24 M 0.98 0.02
## 25 M 1.00 0.00
## 26 M 0.92 0.08
## 27 M 0.98 0.02
## 28 M 1.00 0.00
## 29 M 0.84 0.16
## 30 M 0.98 0.02
## 31 M 0.90 0.10
## 32 M 0.82 0.18
## 33 M 0.82 0.18
## 34 R 0.82 0.18
## 35 R 0.56 0.44
## 36 R 0.74 0.26
## 37 R 0.90 0.10
## 38 R 0.82 0.18
## 39 R 0.40 0.60
## 40 R 0.94 0.06
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_REF_Test_ROC <- roc(response = BTREE_REF_Test$BTREE_Observed,
                          predictor = BTREE_REF_Test$BTREE_Predicted.R,
                          levels = rev(levels(BTREE_REF_Test$BTREE_Observed)))

(BTREE_REF_Test_ROCCurveAUC <- auc(BTREE_REF_Test_ROC)[1])
## [1] 0.6406926
##################################
# Conducting random undersampling
##################################
set.seed(12345678)
PMA_PreModelling_Train_RU_OUT <- downSample(x = PMA_PreModelling_Train[,-ncol(PMA_PreModelling_Train)],
                                            y = PMA_PreModelling_Train$Class)
table(PMA_PreModelling_Train_RU_OUT$Class)
##
## M R
## 18 18
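##################################
# Note (sketch, not part of the original pipeline):
# Class is the FIRST column of PMA_PreModelling_Train, so the index
# -ncol(PMA_PreModelling_Train) above drops V60 rather than Class;
# downSample() then appends its own Class column, which appears to be
# why the fitted model below reports 59 predictors. A sketch of the
# more conventional call (the *_RU_ALT name is hypothetical):
##################################
set.seed(12345678)
PMA_PreModelling_Train_RU_ALT <- downSample(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
                                            y = PMA_PreModelling_Train$Class)
table(PMA_PreModelling_Train_RU_ALT$Class)
# downSample() randomly discards majority-class rows until both
# classes match the minority count (18 M, 18 R here)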
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
set.seed(12345678)
BTREE_RU_OUT <- train(x = PMA_PreModelling_Train_RU_OUT[,!names(PMA_PreModelling_Train_RU_OUT) %in% c("Class")],
                      y = PMA_PreModelling_Train_RU_OUT$Class,
                      method = "treebag",
                      nbagg = 50,
                      metric = "ROC",
                      trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_RU_OUT
## Bagged CART
##
## 36 samples
## 59 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 33, 32, 32, 33, 33, 32, ...
## Resampling results:
##
## ROC Sens Spec
## 0.82 0.74 0.65
BTREE_RU_OUT$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_RU_OUT$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.82 0.74 0.65 0.237117 0.3534091 0.3535534
(BTREE_RU_OUT_Train_ROCCurveAUC <- BTREE_RU_OUT$results$ROC)
## [1] 0.82
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_RU_OUT_VarImp <- varImp(BTREE_RU_OUT, scale = TRUE)
plot(BTREE_RU_OUT_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (RU_OUT)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_RU_OUT_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                BTREE_Predicted = predict(BTREE_RU_OUT,
                                                          PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                          type = "prob"))
BTREE_RU_OUT_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.32 0.68
## 2 M 0.58 0.42
## 3 M 0.34 0.66
## 4 M 0.68 0.32
## 5 M 0.60 0.40
## 6 M 0.58 0.42
## 7 M 0.40 0.60
## 8 M 0.34 0.66
## 9 M 0.26 0.74
## 10 M 0.50 0.50
## 11 M 0.80 0.20
## 12 M 0.52 0.48
## 13 M 0.72 0.28
## 14 M 0.68 0.32
## 15 M 0.68 0.32
## 16 M 0.64 0.36
## 17 M 0.66 0.34
## 18 M 0.44 0.56
## 19 M 0.62 0.38
## 20 M 0.70 0.30
## 21 M 0.74 0.26
## 22 M 0.72 0.28
## 23 M 0.58 0.42
## 24 M 0.48 0.52
## 25 M 0.98 0.02
## 26 M 0.74 0.26
## 27 M 0.68 0.32
## 28 M 0.94 0.06
## 29 M 0.68 0.32
## 30 M 0.90 0.10
## 31 M 0.88 0.12
## 32 M 0.78 0.22
## 33 M 0.62 0.38
## 34 R 0.48 0.52
## 35 R 0.32 0.68
## 36 R 0.26 0.74
## 37 R 0.38 0.62
## 38 R 0.34 0.66
## 39 R 0.12 0.88
## 40 R 0.44 0.56
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_RU_OUT_Test_ROC <- roc(response = BTREE_RU_OUT_Test$BTREE_Observed,
                             predictor = BTREE_RU_OUT_Test$BTREE_Predicted.R,
                             levels = rev(levels(BTREE_RU_OUT_Test$BTREE_Observed)))

(BTREE_RU_OUT_Test_ROCCurveAUC <- auc(BTREE_RU_OUT_Test_ROC)[1])
## [1] 0.9090909
##################################
# Conducting random oversampling
##################################
set.seed(12345678)
PMA_PreModelling_Train_RO_OUT <- upSample(x = PMA_PreModelling_Train[,-ncol(PMA_PreModelling_Train)],
                                          y = PMA_PreModelling_Train$Class)
table(PMA_PreModelling_Train_RO_OUT$Class)
##
## M R
## 78 78
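##################################
# Note (sketch, not part of the original pipeline):
# The same column-index caveat as in the down-sampling step applies
# here, which is why the fit below again reports 59 predictors.
# upSample() balances by resampling the minority class with
# replacement up to the majority count (the *_RO_ALT name is
# hypothetical):
##################################
set.seed(12345678)
PMA_PreModelling_Train_RO_ALT <- upSample(x = PMA_PreModelling_Train[,!names(PMA_PreModelling_Train) %in% c("Class")],
                                          y = PMA_PreModelling_Train$Class)
table(PMA_PreModelling_Train_RO_ALT$Class)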
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
set.seed(12345678)
BTREE_RO_OUT <- train(x = PMA_PreModelling_Train_RO_OUT[,!names(PMA_PreModelling_Train_RO_OUT) %in% c("Class")],
                      y = PMA_PreModelling_Train_RO_OUT$Class,
                      method = "treebag",
                      nbagg = 50,
                      metric = "ROC",
                      trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_RO_OUT
## Bagged CART
##
## 156 samples
## 59 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 140, 141, 140, 141, 141, 140, ...
## Resampling results:
##
## ROC Sens Spec
## 1 0.9764286 1
BTREE_RO_OUT$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_RO_OUT$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 1 0.9764286 1 0 0.0568643 0
(BTREE_RO_OUT_Train_ROCCurveAUC <- BTREE_RO_OUT$results$ROC)
## [1] 1
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_RO_OUT_VarImp <- varImp(BTREE_RO_OUT, scale = TRUE)
plot(BTREE_RO_OUT_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (RO_OUT)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_RO_OUT_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                BTREE_Predicted = predict(BTREE_RO_OUT,
                                                          PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                          type = "prob"))
BTREE_RO_OUT_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.64 0.36
## 2 M 0.78 0.22
## 3 M 0.28 0.72
## 4 M 0.62 0.38
## 5 M 0.66 0.34
## 6 M 0.72 0.28
## 7 M 0.86 0.14
## 8 M 0.82 0.18
## 9 M 0.94 0.06
## 10 M 0.94 0.06
## 11 M 0.84 0.16
## 12 M 1.00 0.00
## 13 M 0.84 0.16
## 14 M 0.74 0.26
## 15 M 0.88 0.12
## 16 M 0.78 0.22
## 17 M 0.92 0.08
## 18 M 0.90 0.10
## 19 M 0.92 0.08
## 20 M 0.92 0.08
## 21 M 0.58 0.42
## 22 M 0.98 0.02
## 23 M 0.76 0.24
## 24 M 0.54 0.46
## 25 M 1.00 0.00
## 26 M 0.84 0.16
## 27 M 0.92 0.08
## 28 M 1.00 0.00
## 29 M 0.82 0.18
## 30 M 1.00 0.00
## 31 M 0.96 0.04
## 32 M 0.94 0.06
## 33 M 0.58 0.42
## 34 R 0.82 0.18
## 35 R 0.74 0.26
## 36 R 0.80 0.20
## 37 R 0.88 0.12
## 38 R 0.54 0.46
## 39 R 0.20 0.80
## 40 R 0.94 0.06
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_RO_OUT_Test_ROC <- roc(response = BTREE_RO_OUT_Test$BTREE_Observed,
                             predictor = BTREE_RO_OUT_Test$BTREE_Predicted.R,
                             levels = rev(levels(BTREE_RO_OUT_Test$BTREE_Observed)))

(BTREE_RO_OUT_Test_ROCCurveAUC <- auc(BTREE_RO_OUT_Test_ROC)[1])
## [1] 0.6580087
##################################
# Conducting synthetic minority oversampling technique
##################################
set.seed(12345678)
PMA_PreModelling_Train_SMOTE_OUT <- SMOTE(Class ~ .,
                                          data = PMA_PreModelling_Train)
table(PMA_PreModelling_Train_SMOTE_OUT$Class)
##
## M R
## 72 54
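##################################
# Illustrative sketch (not part of the original pipeline):
# SMOTE creates each synthetic minority case by interpolating between
# a minority observation and one of its k nearest minority neighbours:
# synthetic = x_i + runif(1) * (x_neighbour - x_i). With DMwR's
# defaults (perc.over = 200, perc.under = 200), the 18 R cases gain
# 36 synthetic ones (54 total) while 72 M cases are retained, matching
# the table above. A minimal manual interpolation (hypothetical values):
##################################
x_i         <- c(V1 = -0.5, V2 = 0.2)
x_neighbour <- c(V1 = -0.1, V2 = 0.6)
set.seed(12345678)
gap <- runif(1)
(x_synthetic <- x_i + gap * (x_neighbour - x_i))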
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
set.seed(12345678)
BTREE_SMOTE_OUT <- train(x = PMA_PreModelling_Train_SMOTE_OUT[,!names(PMA_PreModelling_Train_SMOTE_OUT) %in% c("Class")],
                         y = PMA_PreModelling_Train_SMOTE_OUT$Class,
                         method = "treebag",
                         nbagg = 50,
                         metric = "ROC",
                         trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_SMOTE_OUT
## Bagged CART
##
## 126 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 113, 113, 114, 113, 114, 113, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9911548 0.9367857 0.9573333
BTREE_SMOTE_OUT$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_SMOTE_OUT$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.9911548 0.9367857 0.9573333 0.01797308 0.08439291 0.09666549
(BTREE_SMOTE_OUT_Train_ROCCurveAUC <- BTREE_SMOTE_OUT$results$ROC)
## [1] 0.9911548
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_SMOTE_OUT_VarImp <- varImp(BTREE_SMOTE_OUT, scale = TRUE)
plot(BTREE_SMOTE_OUT_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (SMOTE_OUT)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_SMOTE_OUT_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                   BTREE_Predicted = predict(BTREE_SMOTE_OUT,
                                                             PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                             type = "prob"))
BTREE_SMOTE_OUT_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.58 0.42
## 2 M 0.74 0.26
## 3 M 0.62 0.38
## 4 M 0.46 0.54
## 5 M 0.54 0.46
## 6 M 0.58 0.42
## 7 M 0.60 0.40
## 8 M 0.70 0.30
## 9 M 0.80 0.20
## 10 M 0.96 0.04
## 11 M 0.66 0.34
## 12 M 0.90 0.10
## 13 M 0.86 0.14
## 14 M 0.72 0.28
## 15 M 0.74 0.26
## 16 M 0.90 0.10
## 17 M 0.86 0.14
## 18 M 0.58 0.42
## 19 M 0.68 0.32
## 20 M 0.96 0.04
## 21 M 0.32 0.68
## 22 M 0.76 0.24
## 23 M 0.70 0.30
## 24 M 0.60 0.40
## 25 M 1.00 0.00
## 26 M 0.92 0.08
## 27 M 0.96 0.04
## 28 M 1.00 0.00
## 29 M 0.82 0.18
## 30 M 1.00 0.00
## 31 M 1.00 0.00
## 32 M 0.70 0.30
## 33 M 0.46 0.54
## 34 R 0.70 0.30
## 35 R 0.52 0.48
## 36 R 0.48 0.52
## 37 R 0.72 0.28
## 38 R 0.78 0.22
## 39 R 0.16 0.84
## 40 R 0.68 0.32
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_SMOTE_OUT_Test_ROC <- roc(response = BTREE_SMOTE_OUT_Test$BTREE_Observed,
                                predictor = BTREE_SMOTE_OUT_Test$BTREE_Predicted.R,
                                levels = rev(levels(BTREE_SMOTE_OUT_Test$BTREE_Observed)))

(BTREE_SMOTE_OUT_Test_ROCCurveAUC <- auc(BTREE_SMOTE_OUT_Test_ROC)[1])
## [1] 0.7164502
##################################
# Conducting random oversampling examples
##################################
set.seed(12345678)
PMA_PreModelling_Train_ROSE_OUT <- ROSE(Class ~ ., data = PMA_PreModelling_Train)$data
table(PMA_PreModelling_Train_ROSE_OUT$Class)
##
## M R
## 47 49
names(PMA_PreModelling_Train_ROSE_OUT)
## [1] "Class" "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8"
## [10] "V9" "V10" "V11" "V12" "V13" "V14" "V15" "V16" "V17"
## [19] "V18" "V19" "V20" "V21" "V22" "V23" "V24" "V25" "V26"
## [28] "V27" "V28" "V29" "V30" "V31" "V32" "V33" "V34" "V35"
## [37] "V36" "V37" "V38" "V39" "V40" "V41" "V42" "V43" "V44"
## [46] "V45" "V46" "V47" "V48" "V49" "V50" "V51" "V52" "V53"
## [55] "V54" "V55" "V56" "V57" "V58" "V59" "V60"
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
set.seed(12345678)
BTREE_ROSE_OUT <- train(x = PMA_PreModelling_Train_ROSE_OUT[,!names(PMA_PreModelling_Train_ROSE_OUT) %in% c("Class")],
                        y = PMA_PreModelling_Train_ROSE_OUT$Class,
                        method = "treebag",
                        nbagg = 50,
                        metric = "ROC",
                        trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_ROSE_OUT
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 87, 86, 86, 87, 87, 86, ...
## Resampling results:
##
## ROC Sens Spec
## 0.863725 0.762 0.74
BTREE_ROSE_OUT$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_ROSE_OUT$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.863725 0.762 0.74 0.1143333 0.235294 0.1956152
(BTREE_ROSE_OUT_Train_ROCCurveAUC <- BTREE_ROSE_OUT$results$ROC)
## [1] 0.863725
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_ROSE_OUT_VarImp <- varImp(BTREE_ROSE_OUT, scale = TRUE)
plot(BTREE_ROSE_OUT_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (ROSE_OUT)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_ROSE_OUT_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                  BTREE_Predicted = predict(BTREE_ROSE_OUT,
                                                            PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                            type = "prob"))
BTREE_ROSE_OUT_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.40 0.60
## 2 M 0.64 0.36
## 3 M 0.54 0.46
## 4 M 0.50 0.50
## 5 M 0.36 0.64
## 6 M 0.48 0.52
## 7 M 0.36 0.64
## 8 M 0.46 0.54
## 9 M 0.38 0.62
## 10 M 0.48 0.52
## 11 M 0.94 0.06
## 12 M 0.78 0.22
## 13 M 0.88 0.12
## 14 M 0.52 0.48
## 15 M 0.38 0.62
## 16 M 0.38 0.62
## 17 M 0.88 0.12
## 18 M 0.82 0.18
## 19 M 0.72 0.28
## 20 M 0.90 0.10
## 21 M 0.34 0.66
## 22 M 0.80 0.20
## 23 M 0.46 0.54
## 24 M 0.46 0.54
## 25 M 0.96 0.04
## 26 M 0.72 0.28
## 27 M 0.60 0.40
## 28 M 0.94 0.06
## 29 M 0.64 0.36
## 30 M 0.94 0.06
## 31 M 0.88 0.12
## 32 M 0.86 0.14
## 33 M 0.28 0.72
## 34 R 0.62 0.38
## 35 R 0.24 0.76
## 36 R 0.22 0.78
## 37 R 0.50 0.50
## 38 R 0.40 0.60
## 39 R 0.10 0.90
## 40 R 0.44 0.56
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_ROSE_OUT_Test_ROC <- roc(response = BTREE_ROSE_OUT_Test$BTREE_Observed,
                               predictor = BTREE_ROSE_OUT_Test$BTREE_Predicted.R,
                               levels = rev(levels(BTREE_ROSE_OUT_Test$BTREE_Observed)))

(BTREE_ROSE_OUT_Test_ROCCurveAUC <- auc(BTREE_ROSE_OUT_Test_ROC)[1])
## [1] 0.8008658
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary,
                                   sampling = "down")
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
PMA_PreModelling_Train_RU_IN <- PMA_PreModelling_Train

set.seed(12345678)
BTREE_RU_IN <- train(x = PMA_PreModelling_Train_RU_IN[,!names(PMA_PreModelling_Train_RU_IN) %in% c("Class")],
                     y = PMA_PreModelling_Train_RU_IN$Class,
                     method = "treebag",
                     nbagg = 50,
                     metric = "ROC",
                     trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_RU_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Additional sampling using down-sampling
##
## Resampling results:
##
## ROC Sens Spec
## 0.7908929 0.7496429 0.69
BTREE_RU_IN$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_RU_IN$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.7908929 0.7496429 0.69 0.2427919 0.1918018 0.3483196
(BTREE_RU_IN_Train_ROCCurveAUC <- BTREE_RU_IN$results$ROC)
## [1] 0.7908929
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_RU_IN_VarImp <- varImp(BTREE_RU_IN, scale = TRUE)
plot(BTREE_RU_IN_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (RU_IN)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_RU_IN_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                               BTREE_Predicted = predict(BTREE_RU_IN,
                                                         PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                         type = "prob"))
BTREE_RU_IN_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.58 0.42
## 2 M 0.82 0.18
## 3 M 0.56 0.44
## 4 M 0.72 0.28
## 5 M 0.28 0.72
## 6 M 0.48 0.52
## 7 M 0.28 0.72
## 8 M 0.36 0.64
## 9 M 0.38 0.62
## 10 M 0.72 0.28
## 11 M 0.72 0.28
## 12 M 0.68 0.32
## 13 M 0.60 0.40
## 14 M 0.92 0.08
## 15 M 0.82 0.18
## 16 M 0.82 0.18
## 17 M 0.86 0.14
## 18 M 0.58 0.42
## 19 M 0.56 0.44
## 20 M 0.60 0.40
## 21 M 0.36 0.64
## 22 M 0.54 0.46
## 23 M 0.40 0.60
## 24 M 0.54 0.46
## 25 M 0.92 0.08
## 26 M 0.88 0.12
## 27 M 0.82 0.18
## 28 M 0.82 0.18
## 29 M 0.76 0.24
## 30 M 0.82 0.18
## 31 M 0.98 0.02
## 32 M 0.88 0.12
## 33 M 0.58 0.42
## 34 R 0.46 0.54
## 35 R 0.20 0.80
## 36 R 0.34 0.66
## 37 R 0.44 0.56
## 38 R 0.44 0.56
## 39 R 0.18 0.82
## 40 R 0.42 0.58
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_RU_IN_Test_ROC <- roc(response = BTREE_RU_IN_Test$BTREE_Observed,
                            predictor = BTREE_RU_IN_Test$BTREE_Predicted.R,
                            levels = rev(levels(BTREE_RU_IN_Test$BTREE_Observed)))

(BTREE_RU_IN_Test_ROCCurveAUC <- auc(BTREE_RU_IN_Test_ROC)[1])
## [1] 0.8874459
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary,
                                   sampling = "up")
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
PMA_PreModelling_Train_RO_IN <- PMA_PreModelling_Train

set.seed(12345678)
BTREE_RO_IN <- train(x = PMA_PreModelling_Train_RO_IN[,!names(PMA_PreModelling_Train_RO_IN) %in% c("Class")],
                     y = PMA_PreModelling_Train_RO_IN$Class,
                     method = "treebag",
                     nbagg = 50,
                     metric = "ROC",
                     trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_RO_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Additional sampling using up-sampling
##
## Resampling results:
##
## ROC Sens Spec
## 0.8607143 0.9682143 0.44
BTREE_RO_IN$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_RO_IN$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8607143 0.9682143 0.44 0.1767537 0.06262119 0.3994895
(BTREE_RO_IN_Train_ROCCurveAUC <- BTREE_RO_IN$results$ROC)
## [1] 0.8607143
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_RO_IN_VarImp <- varImp(BTREE_RO_IN, scale = TRUE)
plot(BTREE_RO_IN_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (RO_IN)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_RO_IN_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                               BTREE_Predicted = predict(BTREE_RO_IN,
                                                         PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                         type = "prob"))
BTREE_RO_IN_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.72 0.28
## 2 M 0.80 0.20
## 3 M 0.56 0.44
## 4 M 0.62 0.38
## 5 M 0.66 0.34
## 6 M 0.90 0.10
## 7 M 0.88 0.12
## 8 M 0.86 0.14
## 9 M 0.94 0.06
## 10 M 0.92 0.08
## 11 M 0.82 0.18
## 12 M 0.94 0.06
## 13 M 0.86 0.14
## 14 M 0.82 0.18
## 15 M 0.94 0.06
## 16 M 0.86 0.14
## 17 M 0.88 0.12
## 18 M 0.88 0.12
## 19 M 0.86 0.14
## 20 M 0.98 0.02
## 21 M 0.26 0.74
## 22 M 0.92 0.08
## 23 M 0.84 0.16
## 24 M 0.64 0.36
## 25 M 1.00 0.00
## 26 M 0.94 0.06
## 27 M 0.96 0.04
## 28 M 1.00 0.00
## 29 M 0.86 0.14
## 30 M 0.98 0.02
## 31 M 0.96 0.04
## 32 M 0.96 0.04
## 33 M 0.62 0.38
## 34 R 0.86 0.14
## 35 R 0.62 0.38
## 36 R 0.72 0.28
## 37 R 0.86 0.14
## 38 R 0.74 0.26
## 39 R 0.20 0.80
## 40 R 0.90 0.10
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_RO_IN_Test_ROC <- roc(response = BTREE_RO_IN_Test$BTREE_Observed,
                            predictor = BTREE_RO_IN_Test$BTREE_Predicted.R,
                            levels = rev(levels(BTREE_RO_IN_Test$BTREE_Observed)))

(BTREE_RO_IN_Test_ROCCurveAUC <- auc(BTREE_RO_IN_Test_ROC)[1])
## [1] 0.7272727
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary,
                                   sampling = "smote")
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
PMA_PreModelling_Train_SMOTE_IN <- PMA_PreModelling_Train

set.seed(12345678)
BTREE_SMOTE_IN <- train(x = PMA_PreModelling_Train_SMOTE_IN[,!names(PMA_PreModelling_Train_SMOTE_IN) %in% c("Class")],
                        y = PMA_PreModelling_Train_SMOTE_IN$Class,
                        method = "treebag",
                        nbagg = 50,
                        metric = "ROC",
                        trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_SMOTE_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Additional sampling using SMOTE
##
## Resampling results:
##
## ROC Sens Spec
## 0.8502679 0.9596429 0.46
BTREE_SMOTE_IN$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_SMOTE_IN$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8502679 0.9596429 0.46 0.1930138 0.06937744 0.4020356
(BTREE_SMOTE_IN_Train_ROCCurveAUC <- BTREE_SMOTE_IN$results$ROC)
## [1] 0.8502679
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_SMOTE_IN_VarImp <- varImp(BTREE_SMOTE_IN, scale = TRUE)
plot(BTREE_SMOTE_IN_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (SMOTE_IN)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_SMOTE_IN_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                  BTREE_Predicted = predict(BTREE_SMOTE_IN,
                                                            PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                            type = "prob"))
BTREE_SMOTE_IN_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.64 0.36
## 2 M 0.50 0.50
## 3 M 0.38 0.62
## 4 M 0.62 0.38
## 5 M 0.74 0.26
## 6 M 0.90 0.10
## 7 M 0.74 0.26
## 8 M 0.70 0.30
## 9 M 0.90 0.10
## 10 M 0.96 0.04
## 11 M 0.78 0.22
## 12 M 0.92 0.08
## 13 M 0.96 0.04
## 14 M 0.80 0.20
## 15 M 0.92 0.08
## 16 M 0.70 0.30
## 17 M 0.94 0.06
## 18 M 0.64 0.36
## 19 M 0.78 0.22
## 20 M 0.88 0.12
## 21 M 0.34 0.66
## 22 M 0.82 0.18
## 23 M 0.88 0.12
## 24 M 0.62 0.38
## 25 M 1.00 0.00
## 26 M 0.90 0.10
## 27 M 0.98 0.02
## 28 M 0.98 0.02
## 29 M 0.80 0.20
## 30 M 0.98 0.02
## 31 M 0.80 0.20
## 32 M 0.92 0.08
## 33 M 0.44 0.56
## 34 R 0.70 0.30
## 35 R 0.48 0.52
## 36 R 0.66 0.34
## 37 R 0.80 0.20
## 38 R 0.36 0.64
## 39 R 0.14 0.86
## 40 R 0.90 0.10
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_SMOTE_IN_Test_ROC <- roc(response = BTREE_SMOTE_IN_Test$BTREE_Observed,
                               predictor = BTREE_SMOTE_IN_Test$BTREE_Predicted.R,
                               levels = rev(levels(BTREE_SMOTE_IN_Test$BTREE_Observed)))
(BTREE_SMOTE_IN_Test_ROCCurveAUC <- auc(BTREE_SMOTE_IN_Test_ROC)[1])
## [1] 0.7489177
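Beyond the point estimate, the pROC object built above can be plotted and given a confidence interval (a supplementary sketch; pROC's ci.auc applies the DeLong method by default):
##################################
# Supplementary sketch: test-set ROC
# curve with confidence interval
##################################
# Test-set ROC curve for the SMOTE_IN model, with the AUC printed on the plot
plot(BTREE_SMOTE_IN_Test_ROC,
     main = "Bagged Trees (SMOTE_IN) : Test Set ROC Curve",
     print.auc = TRUE)
# 95% confidence interval for the test-set AUC (DeLong by default)
ci.auc(BTREE_SMOTE_IN_Test_ROC)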
##################################
# Creating consistent fold assignments
# for the Repeated Cross Validation process
##################################
set.seed(12345678)
RepeatedCV_Control <- trainControl(method = "repeatedcv",
                                   repeats = 5,
                                   classProbs = TRUE,
                                   summaryFunction = twoClassSummary,
                                   sampling = "rose")
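As with SMOTE, the ROSE resampling happens inside each fold. For illustration only, and assuming ROSE::ROSE defaults (a synthetic sample of the same size with roughly equal class shares), a single standalone draw shows the rebalanced class mix:
##################################
# Supplementary sketch: one standalone
# ROSE draw for illustration
##################################
# One-off ROSE draw (assumption: ROSE() defaults)
set.seed(12345678)
PMA_PreModelling_Train_ROSE_Demo <- ROSE::ROSE(Class ~ ., data = PMA_PreModelling_Train)$data
# Class frequencies in the synthetic sample
table(PMA_PreModelling_Train_ROSE_Demo$Class)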
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
##################################
# Running the bagged trees model
# by setting the caret method to 'treebag'
##################################
PMA_PreModelling_Train_ROSE_IN <- PMA_PreModelling_Train
set.seed(12345678)
BTREE_ROSE_IN <- train(x = PMA_PreModelling_Train_ROSE_IN[,!names(PMA_PreModelling_Train_ROSE_IN) %in% c("Class")],
                       y = PMA_PreModelling_Train_ROSE_IN$Class,
                       method = "treebag",
                       nbagg = 50,
                       metric = "ROC",
                       trControl = RepeatedCV_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
BTREE_ROSE_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Addtional sampling using ROSE
##
## Resampling results:
##
## ROC Sens Spec
## 0.7966071 0.7564286 0.58
BTREE_ROSE_IN$finalModel
##
## Bagging classification trees with 50 bootstrap replications
BTREE_ROSE_IN$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.7966071 0.7564286 0.58 0.1681211 0.1811677 0.3958973
(BTREE_ROSE_IN_Train_ROCCurveAUC <- BTREE_ROSE_IN$results$ROC)
## [1] 0.7966071
##################################
# Identifying and plotting the
# best model predictors
##################################
BTREE_ROSE_IN_VarImp <- varImp(BTREE_ROSE_IN, scale = TRUE)
plot(BTREE_ROSE_IN_VarImp,
     top=25,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Bagged Trees (ROSE_IN)",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Independently evaluating the model
# on the test set
##################################
BTREE_ROSE_IN_Test <- data.frame(BTREE_Observed = PMA_PreModelling_Test$Class,
                                 BTREE_Predicted = predict(BTREE_ROSE_IN,
                                                           PMA_PreModelling_Test[,!names(PMA_PreModelling_Test) %in% c("Class")],
                                                           type = "prob"))
BTREE_ROSE_IN_Test
## BTREE_Observed BTREE_Predicted.M BTREE_Predicted.R
## 1 M 0.66 0.34
## 2 M 0.66 0.34
## 3 M 0.70 0.30
## 4 M 0.90 0.10
## 5 M 0.76 0.24
## 6 M 0.92 0.08
## 7 M 0.78 0.22
## 8 M 0.88 0.12
## 9 M 0.76 0.24
## 10 M 0.84 0.16
## 11 M 0.98 0.02
## 12 M 0.94 0.06
## 13 M 0.96 0.04
## 14 M 0.72 0.28
## 15 M 0.74 0.26
## 16 M 0.68 0.32
## 17 M 0.84 0.16
## 18 M 0.56 0.44
## 19 M 0.70 0.30
## 20 M 0.64 0.36
## 21 M 0.92 0.08
## 22 M 0.90 0.10
## 23 M 0.76 0.24
## 24 M 0.72 0.28
## 25 M 0.96 0.04
## 26 M 0.82 0.18
## 27 M 0.76 0.24
## 28 M 0.96 0.04
## 29 M 0.90 0.10
## 30 M 0.94 0.06
## 31 M 0.86 0.14
## 32 M 0.82 0.18
## 33 M 0.68 0.32
## 34 R 0.56 0.44
## 35 R 0.70 0.30
## 36 R 0.80 0.20
## 37 R 0.86 0.14
## 38 R 0.74 0.26
## 39 R 0.34 0.66
## 40 R 0.78 0.22
##################################
# Reporting the independent evaluation results
# for the test set
##################################
BTREE_ROSE_IN_Test_ROC <- roc(response = BTREE_ROSE_IN_Test$BTREE_Observed,
                              predictor = BTREE_ROSE_IN_Test$BTREE_Predicted.R,
                              levels = rev(levels(BTREE_ROSE_IN_Test$BTREE_Observed)))
(BTREE_ROSE_IN_Test_ROCCurveAUC <- auc(BTREE_ROSE_IN_Test_ROC)[1])
## [1] 0.6969697
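With several test-set ROC objects now computed on the same 40 observations, pairs of curves can also be compared formally. A supplementary sketch with pROC's paired DeLong test, here contrasting the SMOTE_IN and ROSE_IN variants:
##################################
# Supplementary sketch: paired DeLong
# comparison of two test-set ROC curves
##################################
# Paired DeLong comparison of two correlated ROC curves from the same test set
roc.test(BTREE_SMOTE_IN_Test_ROC,
         BTREE_ROSE_IN_Test_ROC,
         paired = TRUE,
         method = "delong")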
##################################
# Consolidating all evaluation results
# for the train and test sets
# using the AUROC metric
##################################
Model <- c('REF','RU_OUT','RO_OUT','SMOTE_OUT','ROSE_OUT','RU_IN','RO_IN','SMOTE_IN','ROSE_IN',
           'REF','RU_OUT','RO_OUT','SMOTE_OUT','ROSE_OUT','RU_IN','RO_IN','SMOTE_IN','ROSE_IN')
Set <- c(rep('Cross-Validation',9),rep('Test',9))
ROCCurveAUC <- c(BTREE_REF_Train_ROCCurveAUC,
                 BTREE_RU_OUT_Train_ROCCurveAUC,
                 BTREE_RO_OUT_Train_ROCCurveAUC,
                 BTREE_SMOTE_OUT_Train_ROCCurveAUC,
                 BTREE_ROSE_OUT_Train_ROCCurveAUC,
                 BTREE_RU_IN_Train_ROCCurveAUC,
                 BTREE_RO_IN_Train_ROCCurveAUC,
                 BTREE_SMOTE_IN_Train_ROCCurveAUC,
                 BTREE_ROSE_IN_Train_ROCCurveAUC,
                 BTREE_REF_Test_ROCCurveAUC,
                 BTREE_RU_OUT_Test_ROCCurveAUC,
                 BTREE_RO_OUT_Test_ROCCurveAUC,
                 BTREE_SMOTE_OUT_Test_ROCCurveAUC,
                 BTREE_ROSE_OUT_Test_ROCCurveAUC,
                 BTREE_RU_IN_Test_ROCCurveAUC,
                 BTREE_RO_IN_Test_ROCCurveAUC,
                 BTREE_SMOTE_IN_Test_ROCCurveAUC,
                 BTREE_ROSE_IN_Test_ROCCurveAUC)
ROCCurveAUC_Summary <- as.data.frame(cbind(Model,Set,ROCCurveAUC))
ROCCurveAUC_Summary$ROCCurveAUC <- as.numeric(as.character(ROCCurveAUC_Summary$ROCCurveAUC))
ROCCurveAUC_Summary$Set <- factor(ROCCurveAUC_Summary$Set,
                                  levels = c("Cross-Validation",
                                             "Test"))
ROCCurveAUC_Summary$Model <- factor(ROCCurveAUC_Summary$Model,
                                    levels = c('REF',
                                               'RU_OUT',
                                               'RO_OUT',
                                               'SMOTE_OUT',
                                               'ROSE_OUT',
                                               'RU_IN',
                                               'RO_IN',
                                               'SMOTE_IN',
                                               'ROSE_IN'))
print(ROCCurveAUC_Summary, row.names=FALSE)
## Model Set ROCCurveAUC
## REF Cross-Validation 0.7815179
## RU_OUT Cross-Validation 0.8200000
## RO_OUT Cross-Validation 1.0000000
## SMOTE_OUT Cross-Validation 0.9911548
## ROSE_OUT Cross-Validation 0.8637250
## RU_IN Cross-Validation 0.7908929
## RO_IN Cross-Validation 0.8607143
## SMOTE_IN Cross-Validation 0.8502679
## ROSE_IN Cross-Validation 0.7966071
## REF Test 0.6406926
## RU_OUT Test 0.9090909
## RO_OUT Test 0.6580087
## SMOTE_OUT Test 0.7164502
## ROSE_OUT Test 0.8008658
## RU_IN Test 0.8874459
## RO_IN Test 0.7272727
## SMOTE_IN Test 0.7489177
## ROSE_IN Test 0.6969697
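The long-format summary also makes it easy to quantify optimism, i.e. how far each model's cross-validated AUROC sits above its test AUROC. A supplementary reshaping sketch with tidyr and dplyr (both loaded earlier); the Gap column name is chosen here for illustration:
##################################
# Supplementary sketch: cross-validation
# versus test AUROC gap
##################################
# One row per model, with the cross-validation-to-test AUROC gap
ROCCurveAUC_Gap <- ROCCurveAUC_Summary %>%
  tidyr::pivot_wider(names_from = Set, values_from = ROCCurveAUC) %>%
  dplyr::mutate(Gap = `Cross-Validation` - Test)
ROCCurveAUC_Gap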
(ROCCurveAUC_Plot <- dotplot(Model ~ ROCCurveAUC,
                             data = ROCCurveAUC_Summary,
                             groups = Set,
                             main = "Classification Model Performance Comparison",
                             ylab = "Model",
                             xlab = "AUROC",
                             auto.key = list(adj=1, space="top", columns=2),
                             type=c("p", "h"),
                             origin = 0,
                             alpha = 0.45,
                             pch = 16,
                             cex = 2))
##################################
# Consolidating the resampling results
# for the candidate models
##################################
(BTREE_RESAMPLING <- resamples(list(REF = BTREE_REF,
                                    RU_OUT = BTREE_RU_OUT,
                                    RO_OUT = BTREE_RO_OUT,
                                    SMOTE_OUT = BTREE_SMOTE_OUT,
                                    ROSE_OUT = BTREE_ROSE_OUT,
                                    RU_IN = BTREE_RU_IN,
                                    RO_IN = BTREE_RO_IN,
                                    SMOTE_IN = BTREE_SMOTE_IN,
                                    ROSE_IN = BTREE_ROSE_IN)))
##
## Call:
## resamples.default(x = list(REF = BTREE_REF, RU_OUT = BTREE_RU_OUT, RO_OUT
## = BTREE_RO_OUT, SMOTE_OUT = BTREE_SMOTE_OUT, ROSE_OUT = BTREE_ROSE_OUT,
## RU_IN = BTREE_RU_IN, RO_IN = BTREE_RO_IN, SMOTE_IN = BTREE_SMOTE_IN, ROSE_IN
## = BTREE_ROSE_IN))
##
## Models: REF, RU_OUT, RO_OUT, SMOTE_OUT, ROSE_OUT, RU_IN, RO_IN, SMOTE_IN, ROSE_IN
## Number of resamples: 50
## Performance metrics: ROC, Sens, Spec
## Time estimates for: everything, final model fit
summary(BTREE_RESAMPLING)
##
## Call:
## summary.resamples(object = BTREE_RESAMPLING)
##
## Models: REF, RU_OUT, RO_OUT, SMOTE_OUT, ROSE_OUT, RU_IN, RO_IN, SMOTE_IN, ROSE_IN
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## REF 0.1250000 0.6462054 0.8281250 0.7815179 1.0000000 1 0
## RU_OUT 0.0000000 0.6250000 1.0000000 0.8200000 1.0000000 1 0
## RO_OUT 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1 0
## SMOTE_OUT 0.9285714 1.0000000 1.0000000 0.9911548 1.0000000 1 0
## ROSE_OUT 0.5600000 0.7803125 0.9000000 0.8637250 0.9600000 1 0
## RU_IN 0.0000000 0.6875000 0.8750000 0.7908929 1.0000000 1 0
## RO_IN 0.2500000 0.7924107 0.9218750 0.8607143 1.0000000 1 0
## SMOTE_IN 0.2500000 0.7500000 0.9375000 0.8502679 1.0000000 1 0
## ROSE_IN 0.3125000 0.7154018 0.7991071 0.7966071 0.9352679 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## REF 0.7142857 1.0000000 1.00 0.9571429 1.000 1 0
## RU_OUT 0.0000000 0.5000000 1.00 0.7400000 1.000 1 0
## RO_OUT 0.7500000 1.0000000 1.00 0.9764286 1.000 1 0
## SMOTE_OUT 0.7142857 0.8571429 1.00 0.9367857 1.000 1 0
## ROSE_OUT 0.2000000 0.6000000 0.80 0.7620000 1.000 1 0
## RU_IN 0.3750000 0.6250000 0.75 0.7496429 0.875 1 0
## RO_IN 0.7500000 1.0000000 1.00 0.9682143 1.000 1 0
## SMOTE_IN 0.7500000 0.8750000 1.00 0.9596429 1.000 1 0
## ROSE_IN 0.2500000 0.6250000 0.75 0.7564286 0.875 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## REF 0.0 0.0 0.50 0.3300000 0.500 1 0
## RU_OUT 0.0 0.5 0.50 0.6500000 1.000 1 0
## RO_OUT 1.0 1.0 1.00 1.0000000 1.000 1 0
## SMOTE_OUT 0.5 1.0 1.00 0.9573333 1.000 1 0
## ROSE_OUT 0.4 0.6 0.80 0.7400000 0.950 1 0
## RU_IN 0.0 0.5 0.75 0.6900000 1.000 1 0
## RO_IN 0.0 0.0 0.50 0.4400000 0.875 1 0
## SMOTE_IN 0.0 0.0 0.50 0.4600000 1.000 1 0
## ROSE_IN 0.0 0.5 0.50 0.5800000 1.000 1 0
##################################
# Exploring the resampling results
##################################
bwplot(BTREE_RESAMPLING,
main = "Model Resampling Performance Comparison (Range)",
ylab = "Model",
pch=16,
cex=2,
layout=c(3,1))
dotplot(BTREE_RESAMPLING,
main = "Model Resampling Performance Comparison (95% Confidence Interval)",
ylab = "Model",
pch=16,
cex=2,
layout=c(3,1))
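Since each train object carries the same number of resamples, caret can also test the pairwise model differences directly. A supplementary sketch using the standard diff() method for resamples objects (note that the OUT variants were fit on resampled copies of the train set, so these paired comparisons should be read with some caution):
##################################
# Supplementary sketch: pairwise
# differences between resampled models
##################################
# Pairwise differences between models, with paired t-tests
# (p-values Bonferroni-adjusted by default)
BTREE_RESAMPLING_DIFF <- diff(BTREE_RESAMPLING)
summary(BTREE_RESAMPLING_DIFF)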
##################################
# Consolidating all models
##################################
(BTREE_MODELS <- list(REF = BTREE_REF,
                      RU_OUT = BTREE_RU_OUT,
                      RO_OUT = BTREE_RO_OUT,
                      SMOTE_OUT = BTREE_SMOTE_OUT,
                      ROSE_OUT = BTREE_ROSE_OUT,
                      RU_IN = BTREE_RU_IN,
                      RO_IN = BTREE_RO_IN,
                      SMOTE_IN = BTREE_SMOTE_IN,
                      ROSE_IN = BTREE_ROSE_IN))
## $REF
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7815179 0.9571429 0.33
##
##
## $RU_OUT
## Bagged CART
##
## 36 samples
## 59 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 33, 32, 32, 33, 33, 32, ...
## Resampling results:
##
## ROC Sens Spec
## 0.82 0.74 0.65
##
##
## $RO_OUT
## Bagged CART
##
## 156 samples
## 59 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 140, 141, 140, 141, 141, 140, ...
## Resampling results:
##
## ROC Sens Spec
## 1 0.9764286 1
##
##
## $SMOTE_OUT
## Bagged CART
##
## 126 samples
## 60 predictor
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 113, 113, 114, 113, 114, 113, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9911548 0.9367857 0.9573333
##
##
## $ROSE_OUT
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 87, 86, 86, 87, 87, 86, ...
## Resampling results:
##
## ROC Sens Spec
## 0.863725 0.762 0.74
##
##
## $RU_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Addtional sampling using down-sampling
##
## Resampling results:
##
## ROC Sens Spec
## 0.7908929 0.7496429 0.69
##
##
## $RO_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## ROC Sens Spec
## 0.8607143 0.9682143 0.44
##
##
## $SMOTE_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Addtional sampling using SMOTE
##
## Resampling results:
##
## ROC Sens Spec
## 0.8502679 0.9596429 0.46
##
##
## $ROSE_IN
## Bagged CART
##
## 96 samples
## 60 predictors
## 2 classes: 'M', 'R'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 86, 87, 86, 87, 87, 86, ...
## Addtional sampling using ROSE
##
## Resampling results:
##
## ROC Sens Spec
## 0.7966071 0.7564286 0.58
##################################
# Creating a function to evaluate
# model performance on the test set
##################################
BTREE_TEST_ROCCurveAUC <- function(model, data) {
  ROCCurveAUC <- roc(data$Class,
                     predict(model, data, type = "prob")[, "R"],
                     levels = c("M", "R"))
  ci(ROCCurveAUC)
}
BTREE_TEST_SUMMARY <- lapply(BTREE_MODELS,
                             BTREE_TEST_ROCCurveAUC,
                             data = PMA_PreModelling_Test)
BTREE_TEST_SUMMARY <- lapply(BTREE_TEST_SUMMARY, as.vector)
BTREE_TEST_SUMMARY <- do.call("rbind", BTREE_TEST_SUMMARY)
colnames(BTREE_TEST_SUMMARY) <- c("LCL", "ROC", "UCL")
(BTREE_TEST_SUMMARY <- as.data.frame(BTREE_TEST_SUMMARY))
## LCL ROC UCL
## REF 0.3977874 0.6406926 0.8835978
## RU_OUT 0.8187250 0.9090909 0.9994568
## RO_OUT 0.4325914 0.6580087 0.8834259
## SMOTE_OUT 0.5227796 0.7164502 0.9101208
## ROSE_OUT 0.6258540 0.8008658 0.9758776
## RU_IN 0.7813236 0.8874459 0.9935682
## RO_IN 0.5432332 0.7272727 0.9113123
## SMOTE_IN 0.5506533 0.7489177 0.9471822
## ROSE_IN 0.4924561 0.6969697 0.9014833
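To close, the confidence limits in this table can be drawn as per-model intervals. A minimal base-graphics sketch reusing the summary just built (the Model column is added here only for labeling; the limits come from pROC's ci, DeLong by default):
##################################
# Supplementary sketch: test AUROC
# with confidence limits per model
##################################
# Plot each model's test-set AUROC with its 95% confidence limits
BTREE_TEST_SUMMARY$Model <- rownames(BTREE_TEST_SUMMARY)
with(BTREE_TEST_SUMMARY, {
  plot(ROC, seq_along(Model),
       xlim = c(0, 1), yaxt = "n", pch = 16,
       xlab = "AUROC (95% CI)", ylab = "",
       main = "Test Set AUROC with Confidence Limits")
  axis(2, at = seq_along(Model), labels = Model, las = 1, cex.axis = 0.8)
  segments(LCL, seq_along(Model), UCL, seq_along(Model))
})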