##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR2)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
library(nnet)
library(elasticnet)
library(earth)
library(party)
library(kernlab)
library(randomForest)
library(Cubist)
library(pROC)
library(mda)
library(klaR)
library(pamr)
##################################
# Loading source and
# formulating the train set
##################################
# Load the solubility data; the objects solTrainX, solTrainY, solTestX and
# solTestY used below come from this call.
data(solubility)
Solubility_Train <- as.data.frame(cbind(solTrainY, solTrainX))
Solubility_Test <- as.data.frame(cbind(solTestY, solTestX))

##################################
# Applying dichotomization and
# defining the response variable
##################################
# Both sets are split at the TRAIN-set mean, so the test labels are derived
# with the same cut-off as the train labels.
Solubility_Train$Log_Solubility_Class <- ifelse(
  Solubility_Train$solTrainY < mean(Solubility_Train$solTrainY),
  "Low", "High"
)
Solubility_Train$Log_Solubility_Class <- factor(
  Solubility_Train$Log_Solubility_Class,
  levels = c("Low", "High")
)
Solubility_Test$Log_Solubility_Class <- ifelse(
  Solubility_Test$solTestY < mean(Solubility_Train$solTrainY),
  "Low", "High"
)
Solubility_Test$Log_Solubility_Class <- factor(
  Solubility_Test$Log_Solubility_Class,
  levels = c("Low", "High")
)

# The continuous response is dropped only after both class columns exist,
# because the test labels above still need the train-set mean.
Solubility_Train$solTrainY <- NULL
Solubility_Test$solTestY <- NULL

##################################
# Filtering in a subset of variables
# for the analysis
##################################
Solubility_Train <- Solubility_Train[, c("HydrophilicFactor",
                                         "NumAtoms",
                                         "NumNonHAtoms",
                                         "NumCarbon",
                                         "Log_Solubility_Class")]
Solubility_Test <- Solubility_Test[, c("HydrophilicFactor",
                                       "NumAtoms",
                                       "NumNonHAtoms",
                                       "NumCarbon",
                                       "Log_Solubility_Class")]
##################################
# Performing a general exploration of the train set
##################################
# Row and column counts of the train set
dim(Solubility_Train)
## [1] 951 5
# Column types and leading values of the train set
str(Solubility_Train)
## 'data.frame': 951 obs. of 5 variables:
## $ HydrophilicFactor : num -0.856 -0.37 -0.33 -0.96 -0.069 -0.651 -0.729 -0.835 0.194 0.353 ...
## $ NumAtoms : int 28 49 33 26 31 32 35 38 56 37 ...
## $ NumNonHAtoms : int 16 26 15 10 15 15 23 14 27 17 ...
## $ NumCarbon : int 14 21 13 10 9 10 17 12 22 14 ...
## $ Log_Solubility_Class: Factor w/ 2 levels "Low","High": 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summaries (quantiles for numeric columns, level counts for the factor)
summary(Solubility_Train)
## HydrophilicFactor NumAtoms NumNonHAtoms NumCarbon
## Min. :-0.98500 Min. : 5.00 Min. : 2.00 Min. : 1.000
## 1st Qu.:-0.76300 1st Qu.:17.00 1st Qu.: 8.00 1st Qu.: 6.000
## Median :-0.31400 Median :22.00 Median :12.00 Median : 9.000
## Mean :-0.02059 Mean :25.51 Mean :13.16 Mean : 9.893
## 3rd Qu.: 0.31300 3rd Qu.:31.00 3rd Qu.:17.00 3rd Qu.:12.000
## Max. :13.48300 Max. :94.00 Max. :47.00 Max. :33.000
## Log_Solubility_Class
## Low :427
## High:524
##
##
##
##
##################################
# Performing a general exploration of the test set
##################################
# Row and column counts of the test set
dim(Solubility_Test)
## [1] 316 5
# Column types and leading values of the test set
str(Solubility_Test)
## 'data.frame': 316 obs. of 5 variables:
## $ HydrophilicFactor : num 0.492 1.317 0.846 0.984 0.843 ...
## $ NumAtoms : int 8 13 14 19 15 8 8 13 13 17 ...
## $ NumNonHAtoms : int 5 6 8 7 9 4 4 5 5 8 ...
## $ NumCarbon : int 2 3 6 5 6 2 3 4 4 7 ...
## $ Log_Solubility_Class: Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
# Per-column summaries (quantiles for numeric columns, level counts for the factor)
summary(Solubility_Test)
## HydrophilicFactor NumAtoms NumNonHAtoms NumCarbon
## Min. :-0.9860 Min. : 5.0 Min. : 3.00 Min. : 1.000
## 1st Qu.:-0.7670 1st Qu.:17.0 1st Qu.: 8.00 1st Qu.: 6.000
## Median :-0.3970 Median :22.0 Median :11.00 Median : 8.000
## Mean :-0.1022 Mean :24.6 Mean :12.71 Mean : 9.785
## 3rd Qu.: 0.2140 3rd Qu.:29.0 3rd Qu.:16.00 3rd Qu.:12.000
## Max. : 5.0000 Max. :68.0 Max. :33.00 Max. :24.000
## Log_Solubility_Class
## Low :143
## High:173
##
##
##
##
##################################
# Formulating a data type assessment summary
##################################
PDA <- Solubility_Train
# One row per column: position, name and primary class. Only the first class
# is kept so that multi-class columns still give one value per column.
# The outer parentheses print the summary as it is assigned.
(PDA.Summary <- data.frame(
  Column.Index = seq_along(names(PDA)),
  Column.Name = names(PDA),
  Column.Type = vapply(PDA, function(x) class(x)[1], character(1)),
  row.names = NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 HydrophilicFactor numeric
## 2 2 NumAtoms integer
## 3 3 NumNonHAtoms integer
## 4 4 NumCarbon integer
## 5 5 Log_Solubility_Class factor
##################################
# Loading dataset
##################################
DQA <- Solubility_Train

##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column: type, row count, number of missing values and fill
# rate. Fill.Rate is a formatted string with three decimals.
# The outer parentheses print the summary as it is assigned.
(DQA.Summary <- data.frame(
  Column.Index = seq_along(names(DQA)),
  Column.Name = names(DQA),
  Column.Type = vapply(DQA, function(x) class(x)[1], character(1)),
  Row.Count = vapply(DQA, function(x) nrow(DQA), integer(1)),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 HydrophilicFactor numeric 951 0 1.000
## 2 2 NumAtoms integer 951 0 1.000
## 3 3 NumNonHAtoms integer 951 0 1.000
## 4 4 NumCarbon integer 951 0 1.000
## 5 5 Log_Solubility_Class factor 951 0 1.000
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even when a single column remains.
DQA.Predictors <- DQA[, !names(DQA) %in% c("Log_Solubility_Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[
  , vapply(DQA.Predictors, is.numeric, logical(1)), drop = FALSE
]

if (length(names(DQA.Predictors.Numeric)) > 0) {
  print(paste0("There are ",
               (length(names(DQA.Predictors.Numeric))),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 4 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# drop = FALSE keeps a (possibly zero-column) data frame in every case.
DQA.Predictors.Factor <- DQA.Predictors[
  , vapply(DQA.Predictors, is.factor, logical(1)), drop = FALSE
]

if (length(names(DQA.Predictors.Factor)) > 0) {
  print(paste0("There are ",
               (length(names(DQA.Predictors.Factor))),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every non-missing value of x tied for the highest frequency.
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent value(s) of x once the first mode(s) are removed.
  # Returns the sentinel "x" when no value is left (every observation equals
  # the first mode) or when the most frequent remaining value is NA.
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- x[!(x %in% fm)]
    usm <- unique(sm)
    # Guard first: max() on an empty tabulation would warn and yield -Inf.
    if (length(usm) == 0) {
      return("x")
    }
    tabsm <- tabulate(match(sm, usm))
    second <- usm[tabsm == max(tabsm)]
    if (anyNA(second)) {
      return("x")
    }
    second
  }

  # One row per factor predictor; ratios are formatted strings with three
  # decimals. The outer parentheses print the summary as it is assigned.
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = sapply(DQA.Predictors.Factor, function(x) class(x)),
    Unique.Count = sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
    First.Mode.Value = sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
    Second.Mode.Value = sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
    First.Mode.Count = sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count = sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    Unique.Count.Ratio = sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x)) / nrow(DQA.Predictors.Factor)), 3), nsmall = 3)),
    First.Second.Mode.Ratio = sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1]) / sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall = 3)),
    row.names = NULL)
  )

}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {

  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every non-missing value of x tied for the highest frequency.
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once the first mode(s)
  # are removed. Returns the sentinel 0.00001 when no value is left (a constant
  # column). The sentinel then matches no observation, so the first-to-second
  # mode ratio becomes Inf and the column is flagged as low variance.
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- na.omit(x)[!(na.omit(x) %in% fm)]
    usm <- unique(sm)
    # Guard first: max() on an empty tabulation would warn and yield -Inf.
    if (length(usm) == 0) {
      return(0.00001)
    }
    tabsm <- tabulate(match(sm, usm))
    second <- usm[tabsm == max(tabsm)]
    if (anyNA(second)) {
      return(0.00001)
    }
    second
  }

  # One row per numeric predictor. Every statistic except the counts is a
  # formatted string with three decimals; skewness() and kurtosis() are the
  # functions supplied by the packages loaded at the top of the file.
  # The outer parentheses print the summary as it is assigned.
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = sapply(DQA.Predictors.Numeric, function(x) class(x)),
    Unique.Count = sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
    Unique.Count.Ratio = sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x)) / nrow(DQA.Predictors.Numeric)), 3), nsmall = 3)),
    First.Mode.Value = sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]), 3), nsmall = 3)),
    Second.Mode.Value = sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]), 3), nsmall = 3)),
    First.Mode.Count = sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count = sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    First.Second.Mode.Ratio = sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1]) / sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall = 3)),
    Minimum = sapply(DQA.Predictors.Numeric, function(x) format(round(min(x, na.rm = TRUE), 3), nsmall = 3)),
    Mean = sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall = 3)),
    Median = sapply(DQA.Predictors.Numeric, function(x) format(round(median(x, na.rm = TRUE), 3), nsmall = 3)),
    Maximum = sapply(DQA.Predictors.Numeric, function(x) format(round(max(x, na.rm = TRUE), 3), nsmall = 3)),
    Skewness = sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x, na.rm = TRUE), 3), nsmall = 3)),
    Kurtosis = sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x, na.rm = TRUE), 3), nsmall = 3)),
    Percentile25th = sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs = 0.25, na.rm = TRUE), 3), nsmall = 3)),
    Percentile75th = sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs = 0.75, na.rm = TRUE), 3), nsmall = 3)),
    row.names = NULL)
  )

}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 HydrophilicFactor numeric 369 0.388
## 2 NumAtoms integer 66 0.069
## 3 NumNonHAtoms integer 36 0.038
## 4 NumCarbon integer 28 0.029
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 -0.828 -0.158 21 20
## 2 22.000 24.000 73 51
## 3 8.000 11.000 104 73
## 4 6.000 7.000 105 97
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## 2 1.431 5.000 25.507 22.000 94.000 1.364 5.523
## 3 1.425 2.000 13.161 12.000 47.000 0.993 4.129
## 4 1.082 1.000 9.893 9.000 33.000 0.927 3.616
## Percentile25th Percentile75th
## 1 -0.763 0.313
## 2 17.000 31.000
## 3 8.000 17.000
## 4 6.000 12.000
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report the columns of DQA.Summary that contain at least one missing value.
# The subset is the last expression of its branch, so it is printed as the
# value of the top-level `if`.
if ((nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ])) > 0) {
  print(paste0("Missing observations noted for ",
               (nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ])),
               " variable(s) with NA.Count>0 and Fill.Rate<1.0."))
  DQA.Summary[DQA.Summary$NA.Count > 0, ]
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor predictors whose first mode is more than five times as frequent
# as the second mode. The ratio column holds formatted strings, so it is
# converted back to numeric before comparison.
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ])),
               " factor variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ]
} else {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
# Flag numeric predictors whose first mode is more than five times as frequent
# as the second mode. The ratio column holds formatted strings, so it is
# converted back to numeric before comparison.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ])),
               " numeric variable(s) with First.Second.Mode.Ratio>5."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ]
} else {
  print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
# Flag numeric predictors whose share of distinct values is below 1% of the
# row count. The ratio column holds formatted strings, so it is converted
# back to numeric before comparison.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ])),
               " numeric variable(s) with Unique.Count.Ratio<0.01."))
  DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ]
} else {
  print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
# Flag numeric predictors whose skewness lies outside [-3, 3]. The Skewness
# column holds formatted strings, so it is converted back to numeric; abs()
# covers both tails in one comparison. The subset is the last expression of
# its branch, so it is printed as the value of the top-level `if`.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3, ]) > 0) {
  print(paste0("High skewness observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3, ])),
               " numeric variable(s) with Skewness>3 or Skewness<(-3)."))
  DQA.Predictors.Numeric.Summary[abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3, ]
} else {
  print("No skewed numeric predictors noted.")
}
## [1] "High skewness observed for 1 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio
## 1 HydrophilicFactor numeric 369 0.388
## First.Mode.Value Second.Mode.Value First.Mode.Count Second.Mode.Count
## 1 -0.828 -0.158 21 20
## First.Second.Mode.Ratio Minimum Mean Median Maximum Skewness Kurtosis
## 1 1.050 -0.985 -0.021 -0.314 13.483 3.404 27.504
## Percentile25th Percentile75th
## 1 -0.763 0.313
##################################
# Loading dataset
##################################
DPA <- Solubility_Train

##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Log_Solubility_Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when a single numeric column remains.
DPA.Predictors.Numeric <- DPA.Predictors[
  , vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE
]

##################################
# Identifying outliers for the numeric predictors
##################################
# Outliers are the points boxplot.stats() reports in `$out`. One horizontal
# box plot is drawn per predictor, annotated with its outlier count.
OutlierCountList <- integer(ncol(DPA.Predictors.Numeric))

for (i in seq_along(DPA.Predictors.Numeric)) {
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[, i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  OutlierIndices <- which(DPA.Predictors.Numeric[, i] %in% c(Outliers))
  boxplot(DPA.Predictors.Numeric[, i],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal = TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}

# Built directly as typed columns, so the counts are never round-tripped
# through character.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- sum(OutlierCountSummary$OutlierCount > 0)
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "4 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses print the skim summary as it is assigned.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
## Name | DPA.Predictors.Numeric |
## Number of rows | 951 |
## Number of columns | 4 |
## _______________________ | |
## Column type frequency: | |
## numeric | 4 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## HydrophilicFactor | 0 | 1 | -0.02 | 1.13 | -0.98 | -0.76 | -0.31 | 0.31 | 13.48 | ▇▁▁▁▁ |
## NumAtoms | 0 | 1 | 25.51 | 12.61 | 5.00 | 17.00 | 22.00 | 31.00 | 94.00 | ▇▆▂▁▁ |
## NumNonHAtoms | 0 | 1 | 13.16 | 6.50 | 2.00 | 8.00 | 12.00 | 17.00 | 47.00 | ▇▆▂▁▁ |
## NumCarbon | 0 | 1 | 9.89 | 5.29 | 1.00 | 6.00 | 9.00 | 12.00 | 33.00 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
###################################
dim(DPA.Predictors.Numeric)
## [1] 951 4
##################################
# Loading dataset
##################################
DPA <- Solubility_Train

##################################
# Gathering descriptive statistics
##################################
# The outer parentheses print the skim summary as it is assigned.
(DPA_Skimmed <- skim(DPA))
## Name | DPA |
## Number of rows | 951 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## factor | 1 |
## numeric | 4 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Log_Solubility_Class | 0 | 1 | FALSE | 2 | Hig: 524, Low: 427 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## HydrophilicFactor | 0 | 1 | -0.02 | 1.13 | -0.98 | -0.76 | -0.31 | 0.31 | 13.48 | ▇▁▁▁▁ |
## NumAtoms | 0 | 1 | 25.51 | 12.61 | 5.00 | 17.00 | 22.00 | 31.00 | 94.00 | ▇▆▂▁▁ |
## NumNonHAtoms | 0 | 1 | 13.16 | 6.50 | 2.00 | 8.00 | 12.00 | 17.00 | 47.00 | ▇▆▂▁▁ |
## NumCarbon | 0 | 1 | 9.89 | 5.29 | 1.00 | 6.00 | 9.00 | 12.00 | 33.00 | ▇▇▃▁▁ |
##################################
# Identifying columns with low variance
###################################
DPA_LowVariance <- nearZeroVar(DPA,
                               freqCut = 95/5,
                               uniqueCut = 10,
                               saveMetrics = TRUE)

(DPA_LowVariance[DPA_LowVariance$nzv, ])
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)

if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])) == 0) {

  print("No low variance predictors noted.")

} else {

  # NOTE(review): the thresholds quoted in this message (ratio > 4, unique
  # ratio < 0.10) differ from the freqCut = 95/5 and uniqueCut = 10 passed to
  # nearZeroVar() above - confirm which wording is intended.
  print(paste0("Low variance observed for ",
               (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])),
               " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))

  DPA_LowVarianceForRemoval <- (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ]))

  print(paste0("Low variance can be resolved by removing ",
               (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])),
               " numeric variable(s)."))

  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])[j]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_LowVarianceRemovedVariable))
  }

  # Printed explicitly: a value produced in the middle of a braced block is
  # otherwise discarded without being shown.
  print(
    DPA %>%
      skim() %>%
      dplyr::filter(skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ]))
  )

  ##################################
  # Filtering out columns with low variance
  #################################
  DPA_ExcludedLowVariance <- DPA[
    , !names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ]), drop = FALSE
  ]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
}
## [1] "No low variance predictors noted."
##################################
# Loading dataset
##################################
DPA <- Solubility_Train

##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Log_Solubility_Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[
  , vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE
]

##################################
# Visualizing pairwise correlation between predictors
##################################
# The p-values from cor.mtest() are passed to corrplot() through p.mat;
# together with sig.level = 0.05 and insig = "blank" this leaves blank the
# cells whose correlation is not significant.
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
                                 method = "pearson",
                                 conf.level = .95)

corrplot(cor(DPA.Predictors.Numeric,
             method = "pearson",
             use = "pairwise.complete.obs"),
         method = "circle",
         type = "upper",
         order = "original",
         tl.col = "black",
         tl.cex = 0.75,
         tl.srt = 90,
         sig.level = 0.05,
         p.mat = DPA_CorrelationTest$p,
         insig = "blank")
##################################
# Identifying the highly correlated variables
##################################
DPA_Correlation <- cor(DPA.Predictors.Numeric,
                       method = "pearson",
                       use = "pairwise.complete.obs")

# Number of predictor pairs with |r| > 0.95. na.rm = TRUE keeps an undefined
# correlation from turning the count, and the `if` condition below, into NA.
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95,
                                  na.rm = TRUE))
## [1] 0
if (DPA_HighlyCorrelatedCount == 0) {
  print("No highly correlated predictors noted.")
} else {
  print(paste0("High correlation observed for ",
               (DPA_HighlyCorrelatedCount), " pairs of numeric variable(s) with Correlation.Coefficient>0.95."))

  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           max_pvalue = 0.05,
                                           top = DPA_HighlyCorrelatedCount,
                                           rm.na = TRUE,
                                           grid = FALSE
  ))
}
## [1] "No highly correlated predictors noted."
# When highly correlated pairs exist, list the predictors findCorrelation()
# proposes to drop and build a copy of DPA without them.
if (DPA_HighlyCorrelatedCount > 0) {
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)

  (DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))

  print(paste0("High correlation can be resolved by removing ",
               (DPA_HighlyCorrelatedForRemoval), " numeric variable(s)."))

  for (j in seq_len(DPA_HighlyCorrelatedForRemoval)) {
    DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }

  ##################################
  # Filtering out columns with high correlation
  #################################
  # The indices refer to columns of the correlation matrix, not of DPA, so the
  # columns are removed by name. This also keeps every column when the index
  # vector is empty, where negative indexing would drop them all.
  DPA_ExcludedHighCorrelation <- DPA[
    , !names(DPA) %in% colnames(DPA_Correlation)[DPA_HighlyCorrelated], drop = FALSE
  ]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))

}
##################################
# Loading dataset
##################################
DPA <- Solubility_Train

##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Log_Solubility_Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[
  , vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE
]

##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               (DPA_LinearlyDependentCount), " subset(s) of numeric variable(s)."))

  # One message is printed per variable in each dependent subset, because
  # paste0() is vectorised over the subset's column names.
  for (i in seq_len(DPA_LinearlyDependentCount)) {
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    print(paste0("Linear dependent variable(s) for subset ",
                 i, " include: ",
                 DPA_LinearlyDependentSubset))
  }

}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
# When linear dependencies exist, list the predictors findLinearCombos()
# proposes to drop and build a copy of DPA without them.
if (DPA_LinearlyDependentCount > 0) {
  DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)

  print(paste0("Linear dependency can be resolved by removing ",
               (DPA_LinearlyDependentForRemoval), " numeric variable(s)."))

  for (j in seq_len(DPA_LinearlyDependentForRemoval)) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j, " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }

  ##################################
  # Filtering out columns with linear dependency
  #################################
  # `$remove` indexes columns of DPA.Predictors.Numeric, not of DPA, so the
  # columns are removed by name. This also keeps every column when the index
  # vector is empty, where negative indexing would drop them all.
  DPA_ExcludedLinearlyDependent <- DPA[
    , !names(DPA) %in% colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove],
    drop = FALSE
  ]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))

}
##################################
# Creating the pre-modelling
# train set
##################################
PMA_PreModelling_Train <- Solubility_Train

##################################
# Gathering descriptive statistics
##################################
# The outer parentheses print the skim summary as it is assigned.
(PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train))
## Name | PMA_PreModelling_Train |
## Number of rows | 951 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## factor | 1 |
## numeric | 4 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Log_Solubility_Class | 0 | 1 | FALSE | 2 | Hig: 524, Low: 427 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## HydrophilicFactor | 0 | 1 | -0.02 | 1.13 | -0.98 | -0.76 | -0.31 | 0.31 | 13.48 | ▇▁▁▁▁ |
## NumAtoms | 0 | 1 | 25.51 | 12.61 | 5.00 | 17.00 | 22.00 | 31.00 | 94.00 | ▇▆▂▁▁ |
## NumNonHAtoms | 0 | 1 | 13.16 | 6.50 | 2.00 | 8.00 | 12.00 | 17.00 | 47.00 | ▇▆▂▁▁ |
## NumCarbon | 0 | 1 | 9.89 | 5.29 | 1.00 | 6.00 | 9.00 | 12.00 | 33.00 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
# for the train set
###################################
dim(PMA_PreModelling_Train)
## [1] 951 5
##################################
# Creating the pre-modelling
# test set
##################################
PMA_PreModelling_Test <- Solubility_Test

##################################
# Gathering descriptive statistics
##################################
# The outer parentheses print the skim summary as it is assigned.
(PMA_PreModelling_Test_Skimmed <- skim(PMA_PreModelling_Test))
## Name | PMA_PreModelling_Test |
## Number of rows | 316 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## factor | 1 |
## numeric | 4 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Log_Solubility_Class | 0 | 1 | FALSE | 2 | Hig: 173, Low: 143 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## HydrophilicFactor | 0 | 1 | -0.10 | 0.98 | -0.99 | -0.77 | -0.4 | 0.21 | 5 | ▇▂▁▁▁ |
## NumAtoms | 0 | 1 | 24.60 | 11.89 | 5.00 | 17.00 | 22.0 | 29.00 | 68 | ▅▇▂▁▁ |
## NumNonHAtoms | 0 | 1 | 12.71 | 6.01 | 3.00 | 8.00 | 11.0 | 16.00 | 33 | ▇▇▅▂▁ |
## NumCarbon | 0 | 1 | 9.78 | 5.08 | 1.00 | 6.00 | 8.0 | 12.00 | 24 | ▃▇▃▂▁ |
###################################
# Verifying the data dimensions
# for the test set
###################################
dim(PMA_PreModelling_Test)
## [1] 316 5
##################################
# Loading dataset
##################################
EDA <- PMA_PreModelling_Train

##################################
# Listing all predictors
##################################
EDA.Predictors <- EDA[, !names(EDA) %in% c("Log_Solubility_Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when a single numeric column remains.
EDA.Predictors.Numeric <- EDA.Predictors[
  , vapply(EDA.Predictors, is.numeric, logical(1)), drop = FALSE
]
ncol(EDA.Predictors.Numeric)
## [1] 4
names(EDA.Predictors.Numeric)
## [1] "HydrophilicFactor" "NumAtoms" "NumNonHAtoms"
## [4] "NumCarbon"
##################################
# Formulating the box plots
##################################
# One panel per numeric predictor, split by response class; the axes are
# free so each predictor keeps its own scale.
featurePlot(x = EDA.Predictors.Numeric,
            y = EDA$Log_Solubility_Class,
            plot = "box",
            scales = list(x = list(relation = "free", rot = 90),
                          y = list(relation = "free")),
            adjust = 1.5,
            pch = "|")
##################################
# Creating a local object
# for the train and test sets
##################################
# Model-specific copies of the pre-modelling train and test sets.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric column of the LR train set, grouped by response
# class; both axes are free so each predictor keeps its own scale.
featurePlot(x = PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)],
y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
plot = "box",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Train Set : Numeric Predictor Distribution by Response Level")
# Density plots of the same columns; the legend gets one column per response
# level.
featurePlot(x = PMA_PreModelling_Train_LR[,sapply(PMA_PreModelling_Train_LR, is.numeric)],
y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
plot = "density",
scales = list(x = list(relation="free", rot = 90),
y = list(relation="free")),
adjust = 1.5,
pch = "|",
main = "LR_REF Train Set : Numeric Predictor Distribution by Response Level",
auto.key = list(columns = (length(levels(PMA_PreModelling_Train_LR$Log_Solubility_Class)))))
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# The seed is fixed so the fold assignment is reproducible.
set.seed(12345678)
# returnTrain = TRUE yields the training-row indices of each fold, which is
# what trainControl() receives through `index`.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)
# twoClassSummary together with classProbs = TRUE yields the ROC, Sens and
# Spec metrics reported for the tuned model.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
# Fit on every column except the response, evaluated on the pre-computed
# folds in KFold_Control and scored by ROC.
LR_Tune <- train(x = PMA_PreModelling_Train_LR[, !names(PMA_PreModelling_Train_LR) %in% c("Log_Solubility_Class")],
                 y = PMA_PreModelling_Train_LR$Log_Solubility_Class,
                 method = "glm",
                 metric = "ROC",
                 trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8747542 0.7001107 0.8397678
# Show the final glm fit stored in the train object (coefficients, deviance, AIC).
LR_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 4.04521 1.16804 0.05341 -0.28105
## NumCarbon
## -0.14277
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 834.6 AIC: 844.6
# Show the resampling summary table (mean and SD of ROC, Sens and Spec).
LR_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8747542 0.7001107 0.8397678 0.03649375 0.06155467 0.0638174
# Keep the cross-validated ROC AUC; the outer parentheses make the
# assignment also print the stored value.
(LR_Train_ROCCurveAUC <- LR_Tune$results$ROC)
## [1] 0.8747542
##################################
# Identifying and plotting the
# best model predictors
##################################
# Variable importance for the fitted logistic regression, using caret's
# scaled importance (scale = TRUE).
LR_VarImp <- varImp(LR_Tune, scale = TRUE)

# Plot the four highest-ranked predictors (the model has four predictors,
# so this shows all of them).
plot(LR_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
##################################
# Formulating the box plots
##################################
# Box plots of every numeric predictor in the LR_REF test set, split by the
# Log_Solubility_Class response. Each panel gets its own x and y axis range.
featurePlot(
  x = PMA_PreModelling_Test_LR[,
    vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Test_LR$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_REF Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend is laid out with one
# column per response level.
featurePlot(
  x = PMA_PreModelling_Test_LR[,
    vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Test_LR$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_REF Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Test_LR$Log_Solubility_Class)
  )
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Score the held-out test set with the fitted model. type = "prob" returns one
# probability column per class, which data.frame() names
# LR_Predicted.Low and LR_Predicted.High alongside the observed class.
LR_Test <- data.frame(
  LR_Observed = PMA_PreModelling_Test_LR$Log_Solubility_Class,
  LR_Predicted = predict(
    LR_Tune,
    PMA_PreModelling_Test_LR[,
      !names(PMA_PreModelling_Test_LR) %in% c("Log_Solubility_Class"),
      drop = FALSE
    ],
    type = "prob"
  )
)

# Print the observed classes next to the predicted class probabilities.
LR_Test
## LR_Observed LR_Predicted.Low LR_Predicted.High
## 20 High 0.0336865657 0.966313434
## 21 High 0.0153183414 0.984681659
## 23 High 0.0643972187 0.935602781
## 25 High 0.0285205288 0.971479471
## 28 High 0.0798066290 0.920193371
## 31 High 0.0718016713 0.928198329
## 32 High 0.1028972691 0.897102731
## 33 High 0.1274812074 0.872518793
## 34 High 0.1274812074 0.872518793
## 37 High 0.3233789907 0.676621009
## 38 High 0.3233789907 0.676621009
## 42 High 0.5594454651 0.440554535
## 49 High 0.2002171263 0.799782874
## 54 High 0.0684240570 0.931575943
## 55 High 0.0181095804 0.981890420
## 58 High 0.3886335524 0.611366448
## 60 High 0.1402991233 0.859700877
## 61 High 0.1274812074 0.872518793
## 65 High 0.0864344530 0.913565547
## 69 High 0.4637906536 0.536209346
## 73 High 0.0029275515 0.997072449
## 86 High 0.1127673204 0.887232680
## 90 High 0.0962059194 0.903794081
## 91 High 0.0047906462 0.995209354
## 93 High 0.0962059194 0.903794081
## 96 High 0.0047906462 0.995209354
## 98 High 0.1059035446 0.894096455
## 100 High 0.1131780647 0.886821935
## 104 High 0.9327897176 0.067210282
## 112 High 0.2328162785 0.767183721
## 115 High 0.8387113746 0.161288625
## 119 High 0.1836571689 0.816342831
## 128 High 0.1836571689 0.816342831
## 130 High 0.0190842020 0.980915798
## 139 High 0.0190842020 0.980915798
## 143 High 0.0563130978 0.943686902
## 145 High 0.1531943560 0.846805644
## 146 High 0.1836571689 0.816342831
## 149 High 0.1969074264 0.803092574
## 150 High 0.1674558482 0.832544152
## 152 High 0.1274812074 0.872518793
## 157 High 0.4426510503 0.557348950
## 161 High 0.2961059613 0.703894039
## 162 High 0.0061418555 0.993858144
## 166 High 0.4589405370 0.541059463
## 167 High 0.1909996198 0.809000380
## 173 High 0.1530869135 0.846913086
## 176 High 0.1836571689 0.816342831
## 182 High 0.0613936617 0.938606338
## 187 High 0.0441054546 0.955894545
## 190 High 0.0121865069 0.987813493
## 194 High 0.0576402753 0.942359725
## 195 High 0.2423869901 0.757613010
## 201 High 0.1307021191 0.869297881
## 207 High 0.1726530538 0.827346946
## 208 High 0.4277335519 0.572266448
## 215 High 0.0718016713 0.928198329
## 222 High 0.3047752395 0.695224760
## 224 High 0.2013625086 0.798637491
## 231 High 0.6523674422 0.347632558
## 236 High 0.1398934495 0.860106551
## 237 High 0.0890185417 0.910981458
## 240 High 0.2046581225 0.795341878
## 243 High 0.1307021191 0.869297881
## 248 High 0.2423869901 0.757613010
## 251 High 0.8035890142 0.196410986
## 256 High 0.4398434030 0.560156597
## 258 High 0.2233967470 0.776603253
## 262 High 0.4277335519 0.572266448
## 266 High 0.4628428824 0.537157118
## 272 High 0.4989075140 0.501092486
## 280 High 0.3289791156 0.671020884
## 283 High 0.3828696538 0.617130346
## 286 High 0.4495928586 0.550407141
## 287 High 0.2327138907 0.767286109
## 289 High 0.1292348977 0.870765102
## 290 High 0.3529766973 0.647023303
## 298 High 0.2935585665 0.706441434
## 305 High 0.3346274939 0.665372506
## 306 High 0.1894718660 0.810528134
## 312 High 0.1237091711 0.876290829
## 320 High 0.2893537234 0.710646277
## 325 High 0.2038752816 0.796124718
## 332 High 0.0802850496 0.919714950
## 333 High 0.3890415450 0.610958455
## 335 High 0.2935585665 0.706441434
## 339 High 0.6696503319 0.330349668
## 346 High 0.3986640566 0.601335943
## 347 High 0.0860863900 0.913913610
## 350 High 0.3277438405 0.672256160
## 353 High 0.3910406088 0.608959391
## 358 High 0.3733526855 0.626647315
## 365 High 0.2662005002 0.733799500
## 367 High 0.2182970608 0.781702939
## 370 High 0.0287403751 0.971259625
## 379 High 0.1042586676 0.895741332
## 386 High 0.3129499183 0.687050082
## 394 High 0.5395683832 0.460431617
## 396 High 0.1610214870 0.838978513
## 400 High 0.0890185417 0.910981458
## 404 High 0.0721585562 0.927841444
## 405 High 0.5330445369 0.466955463
## 413 High 0.1852494698 0.814750530
## 415 High 0.4082548314 0.591745169
## 417 High 0.2173598482 0.782640152
## 418 High 0.4530410351 0.546958965
## 423 High 0.2838563728 0.716143627
## 434 High 0.3397469664 0.660253034
## 437 High 0.2188160386 0.781183961
## 440 High 0.3550137284 0.644986272
## 449 High 0.4032385216 0.596761478
## 450 High 0.3044787370 0.695521263
## 457 High 0.4032385216 0.596761478
## 467 High 0.2827750579 0.717224942
## 469 High 0.0818798441 0.918120156
## 474 High 0.9175649757 0.082435024
## 475 High 0.8729656154 0.127034385
## 485 High 0.1139410564 0.886058944
## 504 Low 0.1708923019 0.829107698
## 511 Low 0.6509099069 0.349090093
## 512 Low 0.4402430307 0.559756969
## 517 Low 0.0275983874 0.972401613
## 519 Low 0.7833906241 0.216609376
## 520 Low 0.1216573714 0.878342629
## 522 Low 0.9204361063 0.079563894
## 527 Low 0.6497262336 0.350273766
## 528 Low 0.2689383818 0.731061618
## 529 Low 0.3420093611 0.657990639
## 537 Low 0.1601409934 0.839859007
## 540 Low 0.8991542777 0.100845722
## 541 Low 0.4407363999 0.559263600
## 547 Low 0.8997547622 0.100245238
## 550 Low 0.3840800947 0.615919905
## 555 Low 0.5881995873 0.411800413
## 564 Low 0.0792550489 0.920744951
## 570 Low 0.4185241373 0.581475863
## 573 Low 0.2767634485 0.723236551
## 575 Low 0.4407363999 0.559263600
## 578 Low 0.1997361134 0.800263887
## 581 Low 0.2767634485 0.723236551
## 585 Low 0.3394330523 0.660566948
## 590 Low 0.7555077932 0.244492207
## 601 Low 0.9234128549 0.076587145
## 602 Low 0.6369498782 0.363050122
## 607 Low 0.6234820573 0.376517943
## 610 Low 0.6108630426 0.389136957
## 618 Low 0.7753385721 0.224661428
## 624 Low 0.3394330523 0.660566948
## 626 Low 0.2798585686 0.720141431
## 627 Low 0.3204248034 0.679575197
## 634 Low 0.5983101185 0.401689881
## 640 Low 0.9898595316 0.010140468
## 642 Low 0.1832068271 0.816793173
## 643 Low 0.5081035274 0.491896473
## 644 Low 0.8116432284 0.188356772
## 645 Low 0.7002023850 0.299797615
## 646 Low 0.6507453243 0.349254676
## 647 Low 0.8301016048 0.169898395
## 652 Low 0.2054268163 0.794573184
## 658 Low 0.6607108311 0.339289169
## 659 Low 0.7510505757 0.248949424
## 660 Low 0.9161958942 0.083804106
## 664 Low 0.3408694824 0.659130518
## 666 Low 0.4192476727 0.580752327
## 667 Low 0.8681760956 0.131823904
## 675 Low 0.5081035274 0.491896473
## 680 Low 0.9926469280 0.007353072
## 681 Low 0.9166739748 0.083326025
## 687 Low 0.7797970867 0.220202913
## 694 Low 0.7788178093 0.221182191
## 697 Low 0.4799232251 0.520076775
## 701 Low 0.3033114910 0.696688509
## 705 Low 0.9802413586 0.019758641
## 707 Low 0.6893653986 0.310634601
## 710 Low 0.6311139386 0.368886061
## 716 Low 0.9129372667 0.087062733
## 719 Low 0.9266307028 0.073369297
## 720 Low 0.9743859647 0.025614035
## 725 Low 0.9786074660 0.021392534
## 727 Low 0.3033114910 0.696688509
## 730 Low 0.4471810591 0.552818941
## 738 Low 0.7769124419 0.223087558
## 745 Low 0.5748922115 0.425107789
## 748 Low 0.6952562425 0.304743758
## 751 Low 0.9647413723 0.035258628
## 756 Low 0.7139313964 0.286068604
## 766 Low 0.8661335675 0.133866432
## 769 Low 0.5137458692 0.486254131
## 783 Low 0.8026569527 0.197343047
## 785 Low 0.8373166692 0.162683331
## 790 Low 0.9240417860 0.075958214
## 793 Low 0.8026569527 0.197343047
## 795 Low 0.9935450908 0.006454909
## 796 Low 0.9835769734 0.016423027
## 797 Low 0.6437243296 0.356275670
## 801 Low 0.6958616855 0.304138315
## 811 Low 0.4691432530 0.530856747
## 812 Low 0.9617475976 0.038252402
## 815 Low 0.9376097372 0.062390263
## 816 Low 0.7074523261 0.292547674
## 817 Low 0.9234880845 0.076511916
## 824 Low 0.8349276989 0.165072301
## 825 Low 0.8349276989 0.165072301
## 826 Low 0.8349276989 0.165072301
## 830 Low 0.8513492063 0.148650794
## 837 Low 0.9069592513 0.093040749
## 838 Low 0.7074523261 0.292547674
## 844 Low 0.8819618824 0.118038118
## 845 Low 0.9748569182 0.025143082
## 847 Low 0.9174469864 0.082553014
## 850 Low 0.8637880842 0.136211916
## 852 Low 0.8888561225 0.111143878
## 853 Low 0.8888561225 0.111143878
## 861 Low 0.9101752188 0.089824781
## 868 Low 0.9795950024 0.020404998
## 874 Low 0.9281279972 0.071872003
## 879 High 0.1529569733 0.847043027
## 895 High 0.0463424895 0.953657511
## 899 High 0.0001178774 0.999882123
## 903 High 0.0463424895 0.953657511
## 917 High 0.0423986925 0.957601308
## 927 High 0.0684240570 0.931575943
## 929 High 0.1530869135 0.846913086
## 931 High 0.0684240570 0.931575943
## 933 High 0.4172979708 0.582702029
## 944 High 0.0720404149 0.927959585
## 947 High 0.0962059194 0.903794081
## 949 High 0.2223733535 0.777626646
## 953 High 0.0273184034 0.972681597
## 958 High 0.4446072150 0.555392785
## 961 High 0.0326363542 0.967363646
## 963 High 0.1566911276 0.843308872
## 964 High 0.1307021191 0.869297881
## 973 High 0.0688034909 0.931196509
## 976 High 0.0792550489 0.920744951
## 977 High 0.2423869901 0.757613010
## 980 High 0.2318664434 0.768133557
## 983 High 0.5516380511 0.448361949
## 984 High 0.2423869901 0.757613010
## 986 High 0.1237091711 0.876290829
## 989 High 0.2142824472 0.785717553
## 991 High 0.0363618409 0.963638159
## 996 High 0.0980702886 0.901929711
## 997 High 0.4019257860 0.598074214
## 999 High 0.0643972187 0.935602781
## 1000 High 0.0622947220 0.937705278
## 1003 High 0.0890185417 0.910981458
## 1008 High 0.1343419103 0.865658090
## 1009 High 0.3828696538 0.617130346
## 1014 High 0.0688164915 0.931183509
## 1015 High 0.5341066328 0.465893367
## 1040 High 0.2006168147 0.799383185
## 1042 High 0.3777393739 0.622260626
## 1043 High 0.6482206500 0.351779350
## 1050 High 0.1245951154 0.875404885
## 1052 High 0.2328000692 0.767199931
## 1056 High 0.0302052801 0.969794720
## 1070 High 0.5790424430 0.420957557
## 1073 High 0.4142959793 0.585704021
## 1074 High 0.1993893842 0.800610616
## 1079 High 0.3579641321 0.642035868
## 1080 High 0.4873669707 0.512633029
## 1085 High 0.1155517838 0.884448216
## 1087 High 0.6888614637 0.311138536
## 1096 High 0.9458132723 0.054186728
## 1099 High 0.5089195538 0.491080446
## 1100 High 0.6024802982 0.397519702
## 1102 High 0.0512009711 0.948799029
## 1107 Low 0.3749354744 0.625064526
## 1109 Low 0.7661888954 0.233811105
## 1114 Low 0.4002856069 0.599714393
## 1118 Low 0.4347382476 0.565261752
## 1123 Low 0.4690012962 0.530998704
## 1132 Low 0.8206401102 0.179359890
## 1134 Low 0.6312389825 0.368761018
## 1137 Low 0.3204248034 0.679575197
## 1154 Low 0.3204248034 0.679575197
## 1155 Low 0.6060432019 0.393956798
## 1157 Low 0.8609402930 0.139059707
## 1162 Low 0.5081035274 0.491896473
## 1164 Low 0.1832068271 0.816793173
## 1171 Low 0.9647413723 0.035258628
## 1172 Low 0.4426510503 0.557348950
## 1175 Low 0.7347521726 0.265247827
## 1177 Low 0.5669973713 0.433002629
## 1179 Low 0.9495401865 0.050459814
## 1183 Low 0.2484602589 0.751539741
## 1185 Low 0.9700991083 0.029900892
## 1189 Low 0.9353149148 0.064685085
## 1211 Low 0.7090081559 0.290991844
## 1218 Low 0.9962485352 0.003751465
## 1224 Low 0.3641947857 0.635805214
## 1225 Low 0.3033114910 0.696688509
## 1227 Low 0.9477186443 0.052281356
## 1232 Low 0.9906240752 0.009375925
## 1235 Low 0.7850592628 0.214940737
## 1238 Low 0.8244394114 0.175560589
## 1240 Low 0.8970123273 0.102987673
## 1241 Low 0.7090081559 0.290991844
## 1248 Low 0.8513492063 0.148650794
## 1258 Low 0.7074523261 0.292547674
## 1261 Low 0.8640693634 0.135930637
## 1263 Low 0.8349276989 0.165072301
## 1269 Low 0.9566595486 0.043340451
## 1270 Low 0.9911281573 0.008871843
## 1271 Low 0.8637880842 0.136211916
## 1272 Low 0.8888561225 0.111143878
## 1280 Low 0.9281279972 0.071872003
## 1286 Low 0.9890962672 0.010903733
## 1287 Low 0.9901901712 0.009809829
## 1289 Low 0.9548963849 0.045103615
## 1290 Low 0.8888561225 0.111143878
## 1291 High 0.2132741120 0.786725888
## 1294 High 0.7243151562 0.275684844
## 1305 Low 0.9635883387 0.036411661
## 1308 High 0.8468798982 0.153120102
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# ROC curve on the test set from the predicted probability of the "High"
# class. The factor levels are passed in reversed order, i.e.
# c("High", "Low"), which pROC interprets as (control, case).
LR_Test_ROC <- roc(response = LR_Test$LR_Observed,
                   predictor = LR_Test$LR_Predicted.High,
                   levels = rev(levels(LR_Test$LR_Observed)))

# Keep the test-set AUC as a plain number; the outer parentheses make the
# assignment also print the stored value.
(LR_Test_ROCCurveAUC <- auc(LR_Test_ROC)[1])
## [1] 0.8844739
##################################
# Adding an offset to adjust the
# range of values to only positive values
##################################
# Start again from the untransformed train and test sets.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

# Shift every predictor by +1. The last column is the response
# (Log_Solubility_Class) and is left untouched. seq_len() is used so the loop
# body is skipped, rather than run over c(1, 0), if only the response column
# is present.
# NOTE(review): this only yields strictly positive values if every predictor
# is greater than -1 - confirm against the data.
for (i in seq_len(ncol(PMA_PreModelling_Train) - 1)) {
  PMA_PreModelling_Train[, i] <- PMA_PreModelling_Train[, i] + 1
}

# Apply the same +1 shift to the test-set predictors.
for (i in seq_len(ncol(PMA_PreModelling_Test) - 1)) {
  PMA_PreModelling_Test[, i] <- PMA_PreModelling_Test[, i] + 1
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Model-specific working copies of the train and test sets.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views (i.e. without the factor response). vapply() gives a
# type-stable logical mask and drop = FALSE guarantees a data frame result
# even if a single numeric column is selected.
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,
  vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)),
  drop = FALSE
]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,
  vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Treating data skewness
# for the train set
##################################
# Estimate the Box-Cox transformation parameters from the training data only.
Transform_BoxCox <- preProcess(PMA_PreModelling_Train_LR, method = c("BoxCox"))

# Apply the estimated transformation to the numeric training predictors.
PMA_PreModelling_Train_LR_BCT <- predict(Transform_BoxCox,
                                         PMA_PreModelling_Train_LR.Numeric)

# Re-attach the response, which is not part of the numeric-only frame.
PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class <-
  PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the Box-Cox-transformed train set,
# split by the Log_Solubility_Class response. Each panel gets its own x and y
# axis range.
featurePlot(
  x = PMA_PreModelling_Train_LR_BCT[,
    vapply(PMA_PreModelling_Train_LR_BCT, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_BCT Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend is laid out with one
# column per response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_BCT[,
    vapply(PMA_PreModelling_Train_LR_BCT, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_BCT Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class)
  )
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG state so the fold assignment is reproducible.
set.seed(12345678)

# Partition the training rows into 10 folds on the response.
# returnTrain = TRUE makes each list element hold the row indices used for
# fitting (not the held-out rows), which is the form trainControl(index = )
# expects.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Resampling specification shared by the model fit: 10-fold CV on the fixed
# folds above. twoClassSummary reports ROC/Sens/Spec and needs class
# probabilities, hence classProbs = TRUE.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Re-seed so the fit is reproducible independently of earlier RNG use.
set.seed(12345678)

# Fit the logistic regression (caret method "glm") on the Box-Cox-transformed
# predictors, scoring resamples by ROC AUC under the fixed 10-fold scheme.
# drop = FALSE keeps the predictors as a data frame regardless of how many
# columns remain after removing the response.
LR_BCT_Tune <- train(
  x = PMA_PreModelling_Train_LR_BCT[,
    !names(PMA_PreModelling_Train_LR_BCT) %in% c("Log_Solubility_Class"),
    drop = FALSE
  ],
  y = PMA_PreModelling_Train_LR_BCT$Log_Solubility_Class,
  method = "glm",
  metric = "ROC",
  trControl = KFold_Control
)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Auto-print the fitted train object: sample/predictor counts, the resampling
# scheme and the cross-validated ROC, Sens and Spec estimates.
LR_BCT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8887838 0.7281285 0.8283382
# Show the final glm fit stored in the train object (coefficients, deviance, AIC).
LR_BCT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 10.7676 1.3301 1.4474 -5.4764
## NumCarbon
## -0.1236
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 779.5 AIC: 789.5
# Show the resampling summary table (mean and SD of ROC, Sens and Spec).
LR_BCT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8887838 0.7281285 0.8283382 0.03378421 0.07358024 0.06110159
# Keep the cross-validated ROC AUC; the outer parentheses make the
# assignment also print the stored value.
(LR_BCT_Train_ROCCurveAUC <- LR_BCT_Tune$results$ROC)
## [1] 0.8887838
##################################
# Identifying and plotting the
# best model predictors
##################################
# Variable importance for the logistic regression fitted on the
# Box-Cox-transformed predictors, using caret's scaled importance
# (scale = TRUE).
LR_BCT_VarImp <- varImp(LR_BCT_Tune, scale = TRUE)

# Plot the four highest-ranked predictors (the model has four predictors,
# so this shows all of them).
plot(LR_BCT_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# Apply the already-fitted Transform_BoxCox object to the numeric test
# predictors; no transformation parameters are re-estimated here.
PMA_PreModelling_Test_LR_BCT <- predict(Transform_BoxCox,
                                        PMA_PreModelling_Test_LR.Numeric)

# Re-attach the response, which is not part of the numeric-only frame.
PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class <-
  PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the Box-Cox-transformed test set,
# split by the Log_Solubility_Class response. Each panel gets its own x and y
# axis range.
featurePlot(
  x = PMA_PreModelling_Test_LR_BCT[,
    vapply(PMA_PreModelling_Test_LR_BCT, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_BCT Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend is laid out with one
# column per response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_BCT[,
    vapply(PMA_PreModelling_Test_LR_BCT, is.numeric, logical(1))
  ],
  y = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_BCT Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class)
  )
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Score the Box-Cox-transformed test set with the fitted model. type = "prob"
# returns one probability column per class, which data.frame() names
# LR_BCT_Predicted.Low and LR_BCT_Predicted.High alongside the observed class.
LR_BCT_Test <- data.frame(
  LR_BCT_Observed = PMA_PreModelling_Test_LR_BCT$Log_Solubility_Class,
  LR_BCT_Predicted = predict(
    LR_BCT_Tune,
    PMA_PreModelling_Test_LR_BCT[,
      !names(PMA_PreModelling_Test_LR_BCT) %in% c("Log_Solubility_Class"),
      drop = FALSE
    ],
    type = "prob"
  )
)

# Print the observed classes next to the predicted class probabilities.
LR_BCT_Test
## LR_BCT_Observed LR_BCT_Predicted.Low LR_BCT_Predicted.High
## 20 High 0.0106788478 0.989321152
## 21 High 0.0071362537 0.992863746
## 23 High 0.0393459886 0.960654011
## 25 High 0.0121378710 0.987862129
## 28 High 0.0624328061 0.937567194
## 31 High 0.0138721555 0.986127844
## 32 High 0.0247401461 0.975259854
## 33 High 0.0461679379 0.953832062
## 34 High 0.0461679379 0.953832062
## 37 High 0.3571115534 0.642888447
## 38 High 0.3571115534 0.642888447
## 42 High 0.4980215283 0.501978472
## 49 High 0.1438747853 0.856125215
## 54 High 0.0165462619 0.983453738
## 55 High 0.0166769554 0.983323045
## 58 High 0.4550864082 0.544913592
## 60 High 0.0600003953 0.939999605
## 61 High 0.0461679379 0.953832062
## 65 High 0.0447692681 0.955230732
## 69 High 0.5416238123 0.458376188
## 73 High 0.0197460856 0.980253914
## 86 High 0.0422336568 0.957766343
## 90 High 0.0321165091 0.967883491
## 91 High 0.0094919686 0.990508031
## 93 High 0.0321165091 0.967883491
## 96 High 0.0094919686 0.990508031
## 98 High 0.0366929536 0.963307046
## 100 High 0.0542108436 0.945789156
## 104 High 0.8776746097 0.122325390
## 112 High 0.1983421147 0.801657885
## 115 High 0.8831774718 0.116822528
## 119 High 0.1241227426 0.875877257
## 128 High 0.1241227426 0.875877257
## 130 High 0.0181786697 0.981821330
## 139 High 0.0181786697 0.981821330
## 143 High 0.0378887829 0.962111217
## 145 High 0.0730209400 0.926979060
## 146 High 0.1241227426 0.875877257
## 149 High 0.1481173988 0.851882601
## 150 High 0.0964081466 0.903591853
## 152 High 0.0461679379 0.953832062
## 157 High 0.5424164866 0.457583513
## 161 High 0.2169241823 0.783075818
## 162 High 0.0081207937 0.991879206
## 166 High 0.5350267240 0.464973276
## 167 High 0.1238179689 0.876182031
## 173 High 0.0832659662 0.916734034
## 176 High 0.1241227426 0.875877257
## 182 High 0.0051278105 0.994872189
## 187 High 0.0422067189 0.957793281
## 190 High 0.0180259773 0.981974023
## 194 High 0.0157103850 0.984289615
## 195 High 0.1420659840 0.857934016
## 201 High 0.0566739425 0.943326058
## 207 High 0.0923516972 0.907648303
## 208 High 0.5326871755 0.467312825
## 215 High 0.0138721555 0.986127844
## 222 High 0.2009338031 0.799066197
## 224 High 0.2021924279 0.797807572
## 231 High 0.7632973587 0.236702641
## 236 High 0.0948388987 0.905161101
## 237 High 0.0164527338 0.983547266
## 240 High 0.1992633468 0.800736653
## 243 High 0.0566739425 0.943326058
## 248 High 0.1420659840 0.857934016
## 251 High 0.7380831273 0.261916873
## 256 High 0.3579586896 0.642041310
## 258 High 0.1476808290 0.852319171
## 262 High 0.5326871755 0.467312825
## 266 High 0.4699388730 0.530061127
## 272 High 0.5770850405 0.422914959
## 280 High 0.3130800631 0.686919937
## 283 High 0.4439245125 0.556075487
## 286 High 0.4514948386 0.548505161
## 287 High 0.1336742035 0.866325796
## 289 High 0.1193414649 0.880658535
## 290 High 0.3335117710 0.666488229
## 298 High 0.3052569144 0.694743086
## 305 High 0.2800013287 0.719998671
## 306 High 0.2260049236 0.773995076
## 312 High 0.1032698105 0.896730190
## 320 High 0.3076680491 0.692331951
## 325 High 0.1842236116 0.815776388
## 332 High 0.0586131421 0.941386858
## 333 High 0.4769698970 0.523030103
## 335 High 0.3052569144 0.694743086
## 339 High 0.8166001784 0.183399822
## 346 High 0.2935888502 0.706411150
## 347 High 0.0320240534 0.967975947
## 350 High 0.4009958389 0.599004161
## 353 High 0.3131508028 0.686849197
## 358 High 0.2701478279 0.729852172
## 365 High 0.2156848824 0.784315118
## 367 High 0.2112425750 0.788757425
## 370 High 0.0509989361 0.949001064
## 379 High 0.1262417880 0.873758212
## 386 High 0.4443738084 0.555626192
## 394 High 0.5041699492 0.495830051
## 396 High 0.1311385627 0.868861437
## 400 High 0.0164527338 0.983547266
## 404 High 0.0393672403 0.960632760
## 405 High 0.6364127374 0.363587263
## 413 High 0.1147530425 0.885246957
## 415 High 0.3508766249 0.649123375
## 417 High 0.2248338620 0.775166138
## 418 High 0.4108293636 0.589170636
## 423 High 0.2838050184 0.716194982
## 434 High 0.2353807171 0.764619283
## 437 High 0.2990469896 0.700953010
## 440 High 0.3983575610 0.601642439
## 449 High 0.3740547041 0.625945296
## 450 High 0.2800405789 0.719959421
## 457 High 0.3740547041 0.625945296
## 467 High 0.3477870747 0.652212925
## 469 High 0.2411307699 0.758869230
## 474 High 0.9420572222 0.057942778
## 475 High 0.9065732264 0.093426774
## 485 High 0.1233535430 0.876646457
## 504 Low 0.2527906760 0.747209324
## 511 Low 0.6865391505 0.313460849
## 512 Low 0.4047166486 0.595283351
## 517 Low 0.0656989760 0.934301024
## 519 Low 0.6847887241 0.315211276
## 520 Low 0.0419643779 0.958035622
## 522 Low 0.9512569299 0.048743070
## 527 Low 0.5950600444 0.404939956
## 528 Low 0.3666971317 0.633302868
## 529 Low 0.3033258986 0.696674101
## 537 Low 0.0894377504 0.910562250
## 540 Low 0.9299392609 0.070060739
## 541 Low 0.6976605213 0.302339479
## 547 Low 0.9224917546 0.077508245
## 550 Low 0.6120118529 0.387988147
## 555 Low 0.4797523411 0.520247659
## 564 Low 0.0198374990 0.980162501
## 570 Low 0.3404075983 0.659592402
## 573 Low 0.2308988075 0.769101192
## 575 Low 0.6976605213 0.302339479
## 578 Low 0.1980215186 0.801978481
## 581 Low 0.2308988075 0.769101192
## 585 Low 0.3361800443 0.663819956
## 590 Low 0.7115152158 0.288484784
## 601 Low 0.7926652156 0.207334784
## 602 Low 0.6243017798 0.375698220
## 607 Low 0.5287426671 0.471257333
## 610 Low 0.5139990677 0.486000932
## 618 Low 0.7102111890 0.289788811
## 624 Low 0.3361800443 0.663819956
## 626 Low 0.2006616946 0.799338305
## 627 Low 0.3402550529 0.659744947
## 634 Low 0.6211336105 0.378866390
## 640 Low 0.9838055034 0.016194497
## 642 Low 0.1760170355 0.823982965
## 643 Low 0.7851585980 0.214841402
## 644 Low 0.8721155291 0.127884471
## 645 Low 0.7767636349 0.223236365
## 646 Low 0.8881478956 0.111852104
## 647 Low 0.7286976220 0.271302378
## 652 Low 0.1652311255 0.834768874
## 658 Low 0.5649909662 0.435009034
## 659 Low 0.9358548026 0.064145197
## 660 Low 0.8494685582 0.150531442
## 664 Low 0.3462913416 0.653708658
## 666 Low 0.4573944495 0.542605550
## 667 Low 0.8366650170 0.163334983
## 675 Low 0.7851585980 0.214841402
## 680 Low 0.9649799678 0.035020032
## 681 Low 0.8599408367 0.140059163
## 687 Low 0.9460336996 0.053966300
## 694 Low 0.7144664403 0.285533560
## 697 Low 0.5847904454 0.415209555
## 701 Low 0.4828825333 0.517117467
## 705 Low 0.9090396249 0.090960375
## 707 Low 0.6414060837 0.358593916
## 710 Low 0.7840992120 0.215900788
## 716 Low 0.8522709843 0.147729016
## 719 Low 0.9417254613 0.058274539
## 720 Low 0.9812037744 0.018796226
## 725 Low 0.9797183384 0.020281662
## 727 Low 0.4828825333 0.517117467
## 730 Low 0.5419442378 0.458055762
## 738 Low 0.8926103955 0.107389604
## 745 Low 0.8485016772 0.151498323
## 748 Low 0.8291228102 0.170877190
## 751 Low 0.9156090270 0.084390973
## 756 Low 0.6729571755 0.327042824
## 766 Low 0.7161626263 0.283837374
## 769 Low 0.6479837562 0.352016244
## 783 Low 0.8822621938 0.117737806
## 785 Low 0.9637029474 0.036297053
## 790 Low 0.8580730609 0.141926939
## 793 Low 0.8822621938 0.117737806
## 795 Low 0.9888028122 0.011197188
## 796 Low 0.9830892650 0.016910735
## 797 Low 0.5255268720 0.474473128
## 801 Low 0.7934333977 0.206566602
## 811 Low 0.5931980901 0.406801910
## 812 Low 0.9094790547 0.090520945
## 815 Low 0.9737933608 0.026206639
## 816 Low 0.7873874858 0.212612514
## 817 Low 0.9838411522 0.016158848
## 824 Low 0.8906357926 0.109364207
## 825 Low 0.8906357926 0.109364207
## 826 Low 0.8906357926 0.109364207
## 830 Low 0.9674044746 0.032595525
## 837 Low 0.9803346321 0.019665368
## 838 Low 0.7873874858 0.212612514
## 844 Low 0.9746818677 0.025318132
## 845 Low 0.9935410672 0.006458933
## 847 Low 0.9348514760 0.065148524
## 850 Low 0.9033168636 0.096683136
## 852 Low 0.9165077717 0.083492228
## 853 Low 0.9165077717 0.083492228
## 861 Low 0.9287872244 0.071212776
## 868 Low 0.9945213964 0.005478604
## 874 Low 0.9399255679 0.060074432
## 879 High 0.0820381758 0.917961824
## 895 High 0.0075429914 0.992457009
## 899 High 0.0003688008 0.999631199
## 903 High 0.0075429914 0.992457009
## 917 High 0.0512730090 0.948726991
## 927 High 0.0165462619 0.983453738
## 929 High 0.0832659662 0.916734034
## 931 High 0.0165462619 0.983453738
## 933 High 0.3637050472 0.636294953
## 944 High 0.0345478479 0.965452152
## 947 High 0.0321165091 0.967883491
## 949 High 0.1830929077 0.816907092
## 953 High 0.0247953563 0.975204644
## 958 High 0.5752647020 0.424735298
## 961 High 0.0114054875 0.988594513
## 963 High 0.1003959629 0.899604037
## 964 High 0.0566739425 0.943326058
## 973 High 0.0493627059 0.950637294
## 976 High 0.0198374990 0.980162501
## 977 High 0.1420659840 0.857934016
## 980 High 0.2975453104 0.702454690
## 983 High 0.6552675251 0.344732475
## 984 High 0.1420659840 0.857934016
## 986 High 0.1032698105 0.896730190
## 989 High 0.1752033347 0.824796665
## 991 High 0.0216816050 0.978318395
## 996 High 0.0213154376 0.978684562
## 997 High 0.4645359623 0.535464038
## 999 High 0.0393459886 0.960654011
## 1000 High 0.0558764682 0.944123532
## 1003 High 0.0164527338 0.983547266
## 1008 High 0.0923518450 0.907648155
## 1009 High 0.4439245125 0.556075487
## 1014 High 0.0345635445 0.965436455
## 1015 High 0.4759596868 0.524040313
## 1040 High 0.1881135515 0.811886448
## 1042 High 0.3560654524 0.643934548
## 1043 High 0.7350851706 0.264914829
## 1050 High 0.1128103317 0.887189668
## 1052 High 0.1590707746 0.840929225
## 1056 High 0.2601441055 0.739855895
## 1070 High 0.5365314855 0.463468514
## 1073 High 0.5067464368 0.493253563
## 1074 High 0.1359320993 0.864067901
## 1079 High 0.4166247667 0.583375233
## 1080 High 0.6521706797 0.347829320
## 1085 High 0.0914183321 0.908581668
## 1087 High 0.5954058977 0.404594102
## 1096 High 0.8884323804 0.111567620
## 1099 High 0.4356482012 0.564351799
## 1100 High 0.7147323089 0.285267691
## 1102 High 0.0756794028 0.924320597
## 1107 Low 0.5814501628 0.418549837
## 1109 Low 0.6608220214 0.339177979
## 1114 Low 0.6200404052 0.379959595
## 1118 Low 0.3315143332 0.668485667
## 1123 Low 0.4710123894 0.528987611
## 1132 Low 0.9166894611 0.083310539
## 1134 Low 0.5681675652 0.431832435
## 1137 Low 0.3402550529 0.659744947
## 1154 Low 0.3402550529 0.659744947
## 1155 Low 0.5465223806 0.453477619
## 1157 Low 0.7298873873 0.270112613
## 1162 Low 0.7851585980 0.214841402
## 1164 Low 0.1760170355 0.823982965
## 1171 Low 0.9156090270 0.084390973
## 1172 Low 0.5424164866 0.457583513
## 1175 Low 0.6006201592 0.399379841
## 1177 Low 0.7100702299 0.289929770
## 1179 Low 0.9122765249 0.087723475
## 1183 Low 0.3352874269 0.664712573
## 1185 Low 0.9229073005 0.077092699
## 1189 Low 0.7983456906 0.201654309
## 1211 Low 0.9189338153 0.081066185
## 1218 Low 0.9896125086 0.010387491
## 1224 Low 0.4190665046 0.580933495
## 1225 Low 0.4828825333 0.517117467
## 1227 Low 0.8952185285 0.104781471
## 1232 Low 0.9862080090 0.013791991
## 1235 Low 0.7011319853 0.298868015
## 1238 Low 0.6722498554 0.327750145
## 1240 Low 0.8403084604 0.159691540
## 1241 Low 0.9189338153 0.081066185
## 1248 Low 0.9674044746 0.032595525
## 1258 Low 0.7873874858 0.212612514
## 1261 Low 0.8966286234 0.103371377
## 1263 Low 0.8906357926 0.109364207
## 1269 Low 0.9567248742 0.043275126
## 1270 Low 0.9871950642 0.012804936
## 1271 Low 0.9033168636 0.096683136
## 1272 Low 0.9165077717 0.083492228
## 1280 Low 0.9399255679 0.060074432
## 1286 Low 0.9964564442 0.003543556
## 1287 Low 0.9967294062 0.003270594
## 1289 Low 0.9574638600 0.042536140
## 1290 Low 0.9165077717 0.083492228
## 1291 High 0.1740583738 0.825941626
## 1294 High 0.8061455726 0.193854427
## 1305 Low 0.8709712683 0.129028732
## 1308 High 0.7071099477 0.292890052
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the test-set ROC curve from the predicted probability of the "High"
# class. `levels` is passed reversed; assuming the observed factor keeps the
# c("Low", "High") ordering used when Log_Solubility_Class was created, pROC
# receives c("High", "Low"), i.e. "High" as control and "Low" as case.
LR_BCT_Test_ROC <- roc(response = LR_BCT_Test$LR_BCT_Observed,
                       predictor = LR_BCT_Test$LR_BCT_Predicted.High,
                       levels = rev(levels(LR_BCT_Test$LR_BCT_Observed)))

# Keep the first element of the auc() result as the test-set AUROC; the outer
# parentheses make the assignment print its value as well.
(LR_BCT_Test_ROCCurveAUC <- auc(LR_BCT_Test_ROC)[1])
## [1] 0.8967622
##################################
# Creating a local object
# for the train and test sets
##################################
# Start this modelling variant from fresh copies of the filtered train and
# test sets so earlier transformations do not carry over.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views (the factor response is excluded by is.numeric).
# vapply() pins the mask to a logical vector, and drop = FALSE keeps the
# result a data frame even if only one numeric column were present.
PMA_PreModelling_Train_LR.Numeric <-
  PMA_PreModelling_Train_LR[, vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)), drop = FALSE]
PMA_PreModelling_Test_LR.Numeric <-
  PMA_PreModelling_Test_LR[, vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)), drop = FALSE]
##################################
# Treating data skewness
# for the train set
##################################
# Estimate the Yeo-Johnson transformation on the train set; the same fitted
# object is reused later for the test set.
Transform_YeoJohnson <- preProcess(PMA_PreModelling_Train_LR, method = c("YeoJohnson"))

# Apply the transformation to the numeric train predictors.
PMA_PreModelling_Train_LR_YJT <- predict(Transform_YeoJohnson, PMA_PreModelling_Train_LR.Numeric)

# predict() was given the numeric columns only, so re-attach the response.
PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric (Yeo-Johnson transformed) train predictor,
# split by response level, each panel on its own free axes.
featurePlot(
  x = PMA_PreModelling_Train_LR_YJT[, vapply(PMA_PreModelling_Train_LR_YJT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_YJT Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column per
# response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_YJT[, vapply(PMA_PreModelling_Train_LR_YJT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_YJT Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG so the fold assignment is reproducible.
set.seed(12345678)

# Ten folds created on the response; returnTrain = TRUE yields the training
# row indices of each fold, which is the form trainControl(index = ) takes.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control reusing those folds. twoClassSummary reports
# ROC / Sens / Spec and needs class probabilities (classProbs = TRUE).
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG before fitting so the run is reproducible.
set.seed(12345678)

# Fit the logistic regression (caret method "glm") on every column except the
# response, selecting on cross-validated ROC with the predefined folds.
LR_YJT_Tune <- train(
  x = PMA_PreModelling_Train_LR_YJT[, !names(PMA_PreModelling_Train_LR_YJT) %in% c("Log_Solubility_Class")],
  y = PMA_PreModelling_Train_LR_YJT$Log_Solubility_Class,
  method = "glm",
  metric = "ROC",
  trControl = KFold_Control
)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the caret summary: resampling scheme and cross-validated
# ROC / Sens / Spec.
LR_YJT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8807066 0.7187154 0.8283019
# Print the underlying glm fit stored by caret (coefficients and deviance).
LR_YJT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 9.3048 1.4325 1.2632 -4.0866
## NumCarbon
## -0.4926
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 806.5 AIC: 816.5
# Print the resampling metrics together with their standard deviations.
LR_YJT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8807066 0.7187154 0.8283019 0.03522898 0.07917814 0.05907546
# Keep the cross-validated AUROC; the outer parentheses also print it.
(LR_YJT_Train_ROCCurveAUC <- LR_YJT_Tune$results$ROC)
## [1] 0.8807066
##################################
# Identifying and plotting the
# best model predictors
##################################
# Compute scaled variable importance for the fitted model.
LR_YJT_VarImp <- varImp(LR_YJT_Tune, scale = TRUE)

# Plot the ranked importances; top = 4 covers all four predictors in the model.
plot(LR_YJT_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# Apply the Yeo-Johnson transformation fitted on the train set to the numeric
# test predictors (nothing is re-estimated from the test data).
PMA_PreModelling_Test_LR_YJT <- predict(Transform_YeoJohnson, PMA_PreModelling_Test_LR.Numeric)

# predict() was given the numeric columns only, so re-attach the response.
PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric (Yeo-Johnson transformed) test predictor,
# split by response level, each panel on its own free axes.
featurePlot(
  x = PMA_PreModelling_Test_LR_YJT[, vapply(PMA_PreModelling_Test_LR_YJT, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_YJT Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column per
# response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_YJT[, vapply(PMA_PreModelling_Test_LR_YJT, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_YJT Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Collect the observed class and the predicted class probabilities for the
# test set. predict(type = "prob") returns one column per class, which
# data.frame() expands into LR_YJT_Predicted.Low and LR_YJT_Predicted.High.
LR_YJT_Test <- data.frame(
  LR_YJT_Observed = PMA_PreModelling_Test_LR_YJT$Log_Solubility_Class,
  LR_YJT_Predicted = predict(
    LR_YJT_Tune,
    # Every column except the response is passed as predictor data.
    PMA_PreModelling_Test_LR_YJT[, !names(PMA_PreModelling_Test_LR_YJT) %in% c("Log_Solubility_Class")],
    type = "prob"
  )
)

# Print the per-observation predictions.
LR_YJT_Test
## LR_YJT_Observed LR_YJT_Predicted.Low LR_YJT_Predicted.High
## 20 High 0.0111852876 0.98881471
## 21 High 0.0086628052 0.99133719
## 23 High 0.0465501013 0.95344990
## 25 High 0.0150865233 0.98491348
## 28 High 0.0673498546 0.93265015
## 31 High 0.0187893105 0.98121069
## 32 High 0.0385145943 0.96148541
## 33 High 0.0651287215 0.93487128
## 34 High 0.0651287215 0.93487128
## 37 High 0.3795518012 0.62044820
## 38 High 0.3795518012 0.62044820
## 42 High 0.5083619918 0.49163801
## 49 High 0.1725513385 0.82744866
## 54 High 0.0211968888 0.97880311
## 55 High 0.0243796249 0.97562038
## 58 High 0.4639398237 0.53606018
## 60 High 0.0740667556 0.92593324
## 61 High 0.0651287215 0.93487128
## 65 High 0.0454691543 0.95453085
## 69 High 0.5509798165 0.44902018
## 73 High 0.0335583314 0.96644167
## 86 High 0.0553639019 0.94463610
## 90 High 0.0398746182 0.96012538
## 91 High 0.0177295306 0.98227047
## 93 High 0.0398746182 0.96012538
## 96 High 0.0177295306 0.98227047
## 98 High 0.0455680065 0.95443199
## 100 High 0.0607090959 0.93929090
## 104 High 0.8782141160 0.12178588
## 112 High 0.2280878343 0.77191217
## 115 High 0.8818973943 0.11810261
## 119 High 0.1495433520 0.85045665
## 128 High 0.1495433520 0.85045665
## 130 High 0.0265537930 0.97344621
## 139 High 0.0265537930 0.97344621
## 143 High 0.0408285223 0.95917148
## 145 High 0.1010816593 0.89891834
## 146 High 0.1495433520 0.85045665
## 149 High 0.1495628278 0.85043717
## 150 High 0.1223809375 0.87761906
## 152 High 0.0651287215 0.93487128
## 157 High 0.5324789498 0.46752105
## 161 High 0.2353887548 0.76461125
## 162 High 0.0108126332 0.98918737
## 166 High 0.5421256704 0.45787433
## 167 High 0.1557431213 0.84425688
## 173 High 0.1060669672 0.89393303
## 176 High 0.1495433520 0.85045665
## 182 High 0.0089571356 0.99104286
## 187 High 0.0559846274 0.94401537
## 190 High 0.0229926784 0.97700732
## 194 High 0.0164604888 0.98353951
## 195 High 0.1682281293 0.83177187
## 201 High 0.0687190029 0.93128100
## 207 High 0.1097517332 0.89024827
## 208 High 0.5386162406 0.46138376
## 215 High 0.0187893105 0.98121069
## 222 High 0.2334165404 0.76658346
## 224 High 0.1887353848 0.81126462
## 231 High 0.7667001343 0.23329987
## 236 High 0.0956712807 0.90432872
## 237 High 0.0259764904 0.97402351
## 240 High 0.1856240164 0.81437598
## 243 High 0.0687190029 0.93128100
## 248 High 0.1682281293 0.83177187
## 251 High 0.7507400098 0.24925999
## 256 High 0.3828163835 0.61718362
## 258 High 0.1615928275 0.83840717
## 262 High 0.5386162406 0.46138376
## 266 High 0.4414142561 0.55858574
## 272 High 0.5848712015 0.41512880
## 280 High 0.2943400032 0.70566000
## 283 High 0.4656557701 0.53434423
## 286 High 0.4230688180 0.57693118
## 287 High 0.1584876212 0.84151238
## 289 High 0.1001632377 0.89983676
## 290 High 0.3147070899 0.68529291
## 298 High 0.3331168358 0.66688316
## 305 High 0.2845465573 0.71545344
## 306 High 0.2377354619 0.76226454
## 312 High 0.1089806628 0.89101934
## 320 High 0.3282503800 0.67174962
## 325 High 0.1789329323 0.82106707
## 332 High 0.0616433564 0.93835664
## 333 High 0.4825359090 0.51746409
## 335 High 0.3331168358 0.66688316
## 339 High 0.7915253506 0.20847465
## 346 High 0.3337862922 0.66621371
## 347 High 0.0354544470 0.96454555
## 350 High 0.4051825670 0.59481743
## 353 High 0.3366565097 0.66334349
## 358 High 0.3080651712 0.69193483
## 365 High 0.2184421650 0.78155784
## 367 High 0.2078586791 0.79214132
## 370 High 0.0572003924 0.94279961
## 379 High 0.1413219999 0.85867800
## 386 High 0.3773059137 0.62269409
## 394 High 0.4977002808 0.50229972
## 396 High 0.1361516876 0.86384831
## 400 High 0.0259764904 0.97402351
## 404 High 0.0341686706 0.96583133
## 405 High 0.6483833587 0.35161664
## 413 High 0.1298373285 0.87016267
## 415 High 0.3574576992 0.64254230
## 417 High 0.2052920777 0.79470792
## 418 High 0.4180844835 0.58191552
## 423 High 0.3207522384 0.67924776
## 434 High 0.2720066738 0.72799333
## 437 High 0.2932273518 0.70677265
## 440 High 0.3919768236 0.60802318
## 449 High 0.3535353096 0.64646469
## 450 High 0.2675541417 0.73244586
## 457 High 0.3535353096 0.64646469
## 467 High 0.3416031366 0.65839686
## 469 High 0.2412064259 0.75879357
## 474 High 0.9372709045 0.06272910
## 475 High 0.9045788808 0.09542112
## 485 High 0.1400180092 0.85998199
## 504 Low 0.3014552746 0.69854473
## 511 Low 0.7986676581 0.20133234
## 512 Low 0.3948293950 0.60517061
## 517 Low 0.0915588732 0.90844113
## 519 Low 0.6710622179 0.32893778
## 520 Low 0.0593466688 0.94065333
## 522 Low 0.9427595075 0.05724049
## 527 Low 0.6066246995 0.39337530
## 528 Low 0.3047234793 0.69527652
## 529 Low 0.2940689781 0.70593102
## 537 Low 0.1137543356 0.88624566
## 540 Low 0.9257268536 0.07427315
## 541 Low 0.5764299193 0.42357008
## 547 Low 0.9213479392 0.07865206
## 550 Low 0.7714659594 0.22853404
## 555 Low 0.4823762799 0.51762372
## 564 Low 0.0264753239 0.97352468
## 570 Low 0.3415221433 0.65847786
## 573 Low 0.2337758471 0.76622415
## 575 Low 0.5764299193 0.42357008
## 578 Low 0.1801949699 0.81980503
## 581 Low 0.2337758471 0.76622415
## 585 Low 0.3222395378 0.67776046
## 590 Low 0.7000901095 0.29990989
## 601 Low 0.7965076159 0.20349238
## 602 Low 0.5911455454 0.40885445
## 607 Low 0.5745963409 0.42540366
## 610 Low 0.5598294027 0.44017060
## 618 Low 0.6718083378 0.32819166
## 624 Low 0.3222395378 0.67776046
## 626 Low 0.2300137304 0.76998627
## 627 Low 0.3796756641 0.62032434
## 634 Low 0.6772469161 0.32275308
## 640 Low 0.9826668299 0.01733317
## 642 Low 0.1596173546 0.84038265
## 643 Low 0.6603529106 0.33964709
## 644 Low 0.8722361193 0.12776388
## 645 Low 0.7805565258 0.21944347
## 646 Low 0.7944944044 0.20550560
## 647 Low 0.7214006575 0.27859934
## 652 Low 0.1909579025 0.80904210
## 658 Low 0.5578158104 0.44218419
## 659 Low 0.8677054479 0.13229455
## 660 Low 0.8660831750 0.13391682
## 664 Low 0.3154666895 0.68453331
## 666 Low 0.4094602677 0.59053973
## 667 Low 0.8101180279 0.18988197
## 675 Low 0.6603529106 0.33964709
## 680 Low 0.9641799867 0.03582001
## 681 Low 0.8594185600 0.14058144
## 687 Low 0.8801008895 0.11989911
## 694 Low 0.7275971055 0.27240289
## 697 Low 0.5638293124 0.43617069
## 701 Low 0.3818211000 0.61817890
## 705 Low 0.9049764620 0.09502354
## 707 Low 0.6651475934 0.33485241
## 710 Low 0.7664857792 0.23351422
## 716 Low 0.8679648226 0.13203518
## 719 Low 0.9391300604 0.06086994
## 720 Low 0.9749949067 0.02500509
## 725 Low 0.9760911253 0.02390887
## 727 Low 0.3818211000 0.61817890
## 730 Low 0.5141175113 0.48588249
## 738 Low 0.8712602625 0.12873974
## 745 Low 0.7303903111 0.26960969
## 748 Low 0.8143622451 0.18563775
## 751 Low 0.9236735195 0.07632648
## 756 Low 0.6594624920 0.34053751
## 766 Low 0.7257122288 0.27428777
## 769 Low 0.6219413487 0.37805865
## 783 Low 0.8773414681 0.12265853
## 785 Low 0.9119616001 0.08803840
## 790 Low 0.8742200814 0.12577992
## 793 Low 0.8773414681 0.12265853
## 795 Low 0.9873215475 0.01267845
## 796 Low 0.9797901295 0.02020987
## 797 Low 0.5245149624 0.47548504
## 801 Low 0.7819966694 0.21800333
## 811 Low 0.5830613359 0.41693866
## 812 Low 0.9209125651 0.07908743
## 815 Low 0.9579291216 0.04207088
## 816 Low 0.7563111238 0.24368888
## 817 Low 0.9554192278 0.04458077
## 824 Low 0.8910529193 0.10894708
## 825 Low 0.8910529193 0.10894708
## 826 Low 0.8910529193 0.10894708
## 830 Low 0.9206758207 0.07932418
## 837 Low 0.9458135660 0.05418643
## 838 Low 0.7563111238 0.24368888
## 844 Low 0.9344794385 0.06552056
## 845 Low 0.9788289697 0.02117103
## 847 Low 0.9342189025 0.06578110
## 850 Low 0.9042545052 0.09574549
## 852 Low 0.9162961404 0.08370386
## 853 Low 0.9162961404 0.08370386
## 861 Low 0.9270249964 0.07297500
## 868 Low 0.9821061387 0.01789386
## 874 Low 0.9367252603 0.06327474
## 879 High 0.1033547908 0.89664521
## 895 High 0.0101263144 0.98987369
## 899 High 0.0009470188 0.99905298
## 903 High 0.0101263144 0.98987369
## 917 High 0.0647560019 0.93524400
## 927 High 0.0211968888 0.97880311
## 929 High 0.1060669672 0.89393303
## 931 High 0.0211968888 0.97880311
## 933 High 0.3554801761 0.64451982
## 944 High 0.0433148853 0.95668511
## 947 High 0.0398746182 0.96012538
## 949 High 0.1877588381 0.81224116
## 953 High 0.0233211466 0.97667885
## 958 High 0.5637319742 0.43626803
## 961 High 0.0150665037 0.98493350
## 963 High 0.1129960710 0.88700393
## 964 High 0.0687190029 0.93128100
## 973 High 0.0532886213 0.94671138
## 976 High 0.0264753239 0.97352468
## 977 High 0.1682281293 0.83177187
## 980 High 0.2904095993 0.70959040
## 983 High 0.6727220240 0.32727798
## 984 High 0.1682281293 0.83177187
## 986 High 0.1089806628 0.89101934
## 989 High 0.2022144660 0.79778553
## 991 High 0.0240809741 0.97591903
## 996 High 0.0333823806 0.96661762
## 997 High 0.4975165837 0.50248342
## 999 High 0.0465501013 0.95344990
## 1000 High 0.0565000618 0.94349994
## 1003 High 0.0259764904 0.97402351
## 1008 High 0.1025560666 0.89744393
## 1009 High 0.4656557701 0.53434423
## 1014 High 0.0413949131 0.95860509
## 1015 High 0.4555939327 0.54440607
## 1040 High 0.1848342215 0.81516578
## 1042 High 0.3372214149 0.66277859
## 1043 High 0.7410621250 0.25893787
## 1050 High 0.1112632945 0.88873671
## 1052 High 0.1738234646 0.82617654
## 1056 High 0.3235717630 0.67642824
## 1070 High 0.5311755155 0.46882448
## 1073 High 0.5151860040 0.48481400
## 1074 High 0.1702113670 0.82978863
## 1079 High 0.4374952373 0.56250476
## 1080 High 0.6262738215 0.37372618
## 1085 High 0.0855709853 0.91442901
## 1087 High 0.6387004046 0.36129960
## 1096 High 0.8979928982 0.10200710
## 1099 High 0.4266976843 0.57330232
## 1100 High 0.7178153041 0.28218470
## 1102 High 0.0821493064 0.91785069
## 1107 Low 0.4800969559 0.51990304
## 1109 Low 0.6555391529 0.34446085
## 1114 Low 0.5204465731 0.47955343
## 1118 Low 0.3451967905 0.65480321
## 1123 Low 0.4281009860 0.57189901
## 1132 Low 0.8951347736 0.10486523
## 1134 Low 0.5798822776 0.42011772
## 1137 Low 0.3796756641 0.62032434
## 1154 Low 0.3796756641 0.62032434
## 1155 Low 0.5576566154 0.44234338
## 1157 Low 0.7299812013 0.27001880
## 1162 Low 0.6603529106 0.33964709
## 1164 Low 0.1596173546 0.84038265
## 1171 Low 0.9236735195 0.07632648
## 1172 Low 0.5324789498 0.46752105
## 1175 Low 0.6025547990 0.39744520
## 1177 Low 0.7033318635 0.29666814
## 1179 Low 0.8937618468 0.10623815
## 1183 Low 0.2760144412 0.72398556
## 1185 Low 0.9280768920 0.07192311
## 1189 Low 0.8086497299 0.19135027
## 1211 Low 0.8354424008 0.16455760
## 1218 Low 0.9892374352 0.01076256
## 1224 Low 0.4458631939 0.55413681
## 1225 Low 0.3818211000 0.61817890
## 1227 Low 0.8964097761 0.10359022
## 1232 Low 0.9844700142 0.01552999
## 1235 Low 0.7387527307 0.26124727
## 1238 Low 0.6806068698 0.31939313
## 1240 Low 0.8384203887 0.16157961
## 1241 Low 0.8354424008 0.16455760
## 1248 Low 0.9206758207 0.07932418
## 1258 Low 0.7563111238 0.24368888
## 1261 Low 0.8844033428 0.11559666
## 1263 Low 0.8910529193 0.10894708
## 1269 Low 0.9557472206 0.04425278
## 1270 Low 0.9853749858 0.01462501
## 1271 Low 0.9042545052 0.09574549
## 1272 Low 0.9162961404 0.08370386
## 1280 Low 0.9367252603 0.06327474
## 1286 Low 0.9875508520 0.01244915
## 1287 Low 0.9885358662 0.01146413
## 1289 Low 0.9523876595 0.04761234
## 1290 Low 0.9162961404 0.08370386
## 1291 High 0.1783988428 0.82160116
## 1294 High 0.8086918533 0.19130815
## 1305 Low 0.8712582117 0.12874179
## 1308 High 0.7148932523 0.28510675
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the test-set ROC curve from the predicted probability of the "High"
# class. The response factor has levels c("Low", "High"), so the reversed
# `levels` hands pROC c("High", "Low"): "High" as control, "Low" as case.
LR_YJT_Test_ROC <- roc(response = LR_YJT_Test$LR_YJT_Observed,
                       predictor = LR_YJT_Test$LR_YJT_Predicted.High,
                       levels = rev(levels(LR_YJT_Test$LR_YJT_Observed)))

# Keep the first element of the auc() result as the test-set AUROC; the outer
# parentheses make the assignment print its value as well.
(LR_YJT_Test_ROCCurveAUC <- auc(LR_YJT_Test_ROC)[1])
## [1] 0.8906181
##################################
# Creating a local object
# for the train and test sets
##################################
# Start this modelling variant from fresh copies of the filtered train and
# test sets so earlier transformations do not carry over.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views (the factor response is excluded by is.numeric).
# vapply() pins the mask to a logical vector, and drop = FALSE keeps the
# result a data frame even if only one numeric column were present.
PMA_PreModelling_Train_LR.Numeric <-
  PMA_PreModelling_Train_LR[, vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)), drop = FALSE]
PMA_PreModelling_Test_LR.Numeric <-
  PMA_PreModelling_Test_LR[, vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)), drop = FALSE]
##################################
# Treating data skewness
# for the train set
##################################
# Estimate the exponential transformation ("expoTrans") on the train set; the
# same fitted object is reused later for the test set.
Transform_Exponential <- preProcess(PMA_PreModelling_Train_LR, method = c("expoTrans"))

# Apply the transformation to the numeric train predictors.
PMA_PreModelling_Train_LR_ET <- predict(Transform_Exponential, PMA_PreModelling_Train_LR.Numeric)

# predict() was given the numeric columns only, so re-attach the response.
PMA_PreModelling_Train_LR_ET$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric (exponentially transformed) train predictor,
# split by response level, each panel on its own free axes.
featurePlot(
  x = PMA_PreModelling_Train_LR_ET[, vapply(PMA_PreModelling_Train_LR_ET, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_ET Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column per
# response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_ET[, vapply(PMA_PreModelling_Train_LR_ET, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_ET Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_ET$Log_Solubility_Class))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG so the fold assignment is reproducible.
set.seed(12345678)

# Ten folds created on the response; returnTrain = TRUE yields the training
# row indices of each fold, which is the form trainControl(index = ) takes.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control reusing those folds. twoClassSummary reports
# ROC / Sens / Spec and needs class probabilities (classProbs = TRUE).
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG before fitting so the run is reproducible.
set.seed(12345678)

# Fit the logistic regression (caret method "glm") on every column except the
# response, selecting on cross-validated ROC with the predefined folds.
LR_ET_Tune <- train(
  x = PMA_PreModelling_Train_LR_ET[, !names(PMA_PreModelling_Train_LR_ET) %in% c("Log_Solubility_Class")],
  y = PMA_PreModelling_Train_LR_ET$Log_Solubility_Class,
  method = "glm",
  metric = "ROC",
  trControl = KFold_Control
)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the caret summary: resampling scheme and cross-validated
# ROC / Sens / Spec.
LR_ET_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8805333 0.7187154 0.830225
# Print the underlying glm fit stored by caret (coefficients and deviance).
LR_ET_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 6.2403 1.4785 0.1379 -0.7367
## NumCarbon
## -0.1887
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 811.6 AIC: 821.6
# Print the resampling metrics together with their standard deviations.
LR_ET_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8805333 0.7187154 0.830225 0.03607516 0.07895841 0.06335689
# Keep the cross-validated AUROC; the outer parentheses also print it.
(LR_ET_Train_ROCCurveAUC <- LR_ET_Tune$results$ROC)
## [1] 0.8805333
##################################
# Identifying and plotting the
# best model predictors
##################################
# Compute scaled variable importance for the fitted model.
LR_ET_VarImp <- varImp(LR_ET_Tune, scale = TRUE)

# Plot the ranked importances; top = 4 covers all four predictors in the model.
plot(LR_ET_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# Apply the exponential transformation fitted on the train set to the numeric
# test predictors (nothing is re-estimated from the test data).
PMA_PreModelling_Test_LR_ET <- predict(Transform_Exponential, PMA_PreModelling_Test_LR.Numeric)

# predict() was given the numeric columns only, so re-attach the response.
PMA_PreModelling_Test_LR_ET$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric (exponentially transformed) test predictor,
# split by response level, each panel on its own free axes.
featurePlot(
  x = PMA_PreModelling_Test_LR_ET[, vapply(PMA_PreModelling_Test_LR_ET, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_ET Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column per
# response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_ET[, vapply(PMA_PreModelling_Test_LR_ET, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_ET Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_ET$Log_Solubility_Class))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Collect the observed class and the predicted class probabilities for the
# test set. predict(type = "prob") returns one column per class, which
# data.frame() expands into LR_ET_Predicted.Low and LR_ET_Predicted.High.
LR_ET_Test <- data.frame(
  LR_ET_Observed = PMA_PreModelling_Test_LR_ET$Log_Solubility_Class,
  LR_ET_Predicted = predict(
    LR_ET_Tune,
    # Every column except the response is passed as predictor data.
    PMA_PreModelling_Test_LR_ET[, !names(PMA_PreModelling_Test_LR_ET) %in% c("Log_Solubility_Class")],
    type = "prob"
  )
)

# Print the per-observation predictions.
LR_ET_Test
## LR_ET_Observed LR_ET_Predicted.Low LR_ET_Predicted.High
## 20 High 0.013752867 0.98624713
## 21 High 0.009194143 0.99080586
## 23 High 0.044071687 0.95592831
## 25 High 0.015356098 0.98464390
## 28 High 0.062116193 0.93788381
## 31 High 0.030063697 0.96993630
## 32 High 0.055492297 0.94450770
## 33 High 0.081840730 0.91815927
## 34 High 0.081840730 0.91815927
## 37 High 0.362323531 0.63767647
## 38 High 0.362323531 0.63767647
## 42 High 0.529862825 0.47013717
## 49 High 0.175456910 0.82454309
## 54 High 0.027319430 0.97268057
## 55 High 0.022068424 0.97793158
## 58 High 0.443511260 0.55648874
## 60 High 0.082693666 0.91730633
## 61 High 0.081840730 0.91815927
## 65 High 0.048911337 0.95108866
## 69 High 0.533781158 0.46621884
## 73 High 0.032939911 0.96706009
## 86 High 0.067654064 0.93234594
## 90 High 0.046903055 0.95309695
## 91 High 0.016987566 0.98301243
## 93 High 0.046903055 0.95309695
## 96 High 0.016987566 0.98301243
## 98 High 0.053351489 0.94664851
## 100 High 0.066118585 0.93388142
## 104 High 0.892159099 0.10784090
## 112 High 0.226736762 0.77326324
## 115 High 0.880645514 0.11935449
## 119 High 0.153976304 0.84602370
## 128 High 0.153976304 0.84602370
## 130 High 0.023851968 0.97614803
## 139 High 0.023851968 0.97614803
## 143 High 0.038552905 0.96144709
## 145 High 0.114662957 0.88533704
## 146 High 0.153976304 0.84602370
## 149 High 0.152805307 0.84719469
## 150 High 0.133318136 0.86668186
## 152 High 0.081840730 0.91815927
## 157 High 0.513142137 0.48685786
## 161 High 0.245051293 0.75494871
## 162 High 0.010303091 0.98969691
## 166 High 0.528097208 0.47190279
## 167 High 0.164575958 0.83542404
## 173 High 0.116830681 0.88316932
## 176 High 0.153976304 0.84602370
## 182 High 0.020554918 0.97944508
## 187 High 0.049933006 0.95006699
## 190 High 0.020110335 0.97988967
## 194 High 0.023549684 0.97645032
## 195 High 0.178797120 0.82120288
## 201 High 0.076255775 0.92374422
## 207 High 0.117776492 0.88222351
## 208 High 0.511302373 0.48869763
## 215 High 0.030063697 0.96993630
## 222 High 0.244990513 0.75500949
## 224 High 0.182053122 0.81794688
## 231 High 0.749586694 0.25041331
## 236 High 0.097554294 0.90244571
## 237 High 0.041892008 0.95810799
## 240 High 0.185612595 0.81438741
## 243 High 0.076255775 0.92374422
## 248 High 0.178797120 0.82120288
## 251 High 0.775565527 0.22443447
## 256 High 0.398539794 0.60146021
## 258 High 0.169197351 0.83080265
## 262 High 0.511302373 0.48869763
## 266 High 0.452920897 0.54707910
## 272 High 0.570995637 0.42900436
## 280 High 0.296779366 0.70322063
## 283 High 0.443886349 0.55611365
## 286 High 0.435270439 0.56472956
## 287 High 0.168888318 0.83111168
## 289 High 0.099729418 0.90027058
## 290 High 0.316994422 0.68300558
## 298 High 0.318073613 0.68192639
## 305 High 0.294523101 0.70547690
## 306 High 0.222812248 0.77718775
## 312 High 0.101828758 0.89817124
## 320 High 0.314342671 0.68565733
## 325 High 0.176122072 0.82387793
## 332 High 0.058470118 0.94152988
## 333 High 0.457755060 0.54224494
## 335 High 0.318073613 0.68192639
## 339 High 0.772723681 0.22727632
## 346 High 0.347449045 0.65255096
## 347 High 0.044973673 0.95502633
## 350 High 0.382267109 0.61773289
## 353 High 0.349013643 0.65098636
## 358 High 0.321275416 0.67872458
## 365 High 0.224247273 0.77575273
## 367 High 0.199038406 0.80096159
## 370 High 0.048269146 0.95173085
## 379 High 0.128343126 0.87165687
## 386 High 0.362492091 0.63750791
## 394 High 0.522815273 0.47718473
## 396 High 0.130957795 0.86904220
## 400 High 0.041892008 0.95810799
## 404 High 0.038322544 0.96167746
## 405 High 0.626352870 0.37364713
## 413 High 0.131232803 0.86876720
## 415 High 0.371596531 0.62840347
## 417 High 0.212807558 0.78719244
## 418 High 0.441346753 0.55865325
## 423 High 0.306995054 0.69300495
## 434 High 0.283812634 0.71618737
## 437 High 0.274034772 0.72596523
## 440 High 0.378573196 0.62142680
## 449 High 0.362681763 0.63731824
## 450 High 0.273348151 0.72665185
## 457 High 0.362681763 0.63731824
## 467 High 0.325012206 0.67498779
## 469 High 0.221752671 0.77824733
## 474 High 0.936674227 0.06332577
## 475 High 0.904392341 0.09560766
## 485 High 0.132581065 0.86741894
## 504 Low 0.288558015 0.71144198
## 511 Low 0.780430879 0.21956912
## 512 Low 0.410836585 0.58916341
## 517 Low 0.082425749 0.91757425
## 519 Low 0.689505392 0.31049461
## 520 Low 0.075648339 0.92435166
## 522 Low 0.940550594 0.05944941
## 527 Low 0.635155993 0.36484401
## 528 Low 0.295989467 0.70401053
## 529 Low 0.301900165 0.69809983
## 537 Low 0.124694956 0.87530504
## 540 Low 0.924323450 0.07567655
## 541 Low 0.545190763 0.45480924
## 547 Low 0.920545497 0.07945450
## 550 Low 0.769867948 0.23013205
## 555 Low 0.500850846 0.49914915
## 564 Low 0.037015067 0.96298493
## 570 Low 0.354179129 0.64582087
## 573 Low 0.238387555 0.76161244
## 575 Low 0.545190763 0.45480924
## 578 Low 0.188629199 0.81137080
## 581 Low 0.238387555 0.76161244
## 585 Low 0.323229930 0.67677007
## 590 Low 0.723611110 0.27638889
## 601 Low 0.797056033 0.20294397
## 602 Low 0.613964299 0.38603570
## 607 Low 0.592073516 0.40792648
## 610 Low 0.577538414 0.42246159
## 618 Low 0.685443653 0.31455635
## 624 Low 0.323229930 0.67677007
## 626 Low 0.241740192 0.75825981
## 627 Low 0.357226702 0.64277330
## 634 Low 0.660110865 0.33988913
## 640 Low 0.982153120 0.01784688
## 642 Low 0.168057998 0.83194200
## 643 Low 0.628905923 0.37109408
## 644 Low 0.867026175 0.13297383
## 645 Low 0.773312713 0.22668729
## 646 Low 0.770707712 0.22929229
## 647 Low 0.714109400 0.28589060
## 652 Low 0.191323910 0.80867609
## 658 Low 0.575949760 0.42405024
## 659 Low 0.851385294 0.14861471
## 660 Low 0.880217375 0.11978263
## 664 Low 0.319822683 0.68017732
## 666 Low 0.416063033 0.58393697
## 667 Low 0.827447505 0.17255250
## 675 Low 0.628905923 0.37109408
## 680 Low 0.964633230 0.03536677
## 681 Low 0.874768134 0.12523187
## 687 Low 0.866602938 0.13339706
## 694 Low 0.748302599 0.25169740
## 697 Low 0.559681179 0.44031882
## 701 Low 0.359811881 0.64018812
## 705 Low 0.912000344 0.08799966
## 707 Low 0.700044807 0.29995519
## 710 Low 0.743622998 0.25637700
## 716 Low 0.885887800 0.11411220
## 719 Low 0.939669824 0.06033018
## 720 Low 0.974804926 0.02519507
## 725 Low 0.976039979 0.02396002
## 727 Low 0.359811881 0.64018812
## 730 Low 0.504435382 0.49556462
## 738 Low 0.859253410 0.14074659
## 745 Low 0.702405944 0.29759406
## 748 Low 0.808544204 0.19145580
## 751 Low 0.933147457 0.06685254
## 756 Low 0.685429835 0.31457016
## 766 Low 0.751514843 0.24848516
## 769 Low 0.602517547 0.39748245
## 783 Low 0.869255200 0.13074480
## 785 Low 0.902579681 0.09742032
## 790 Low 0.886834280 0.11316572
## 793 Low 0.869255200 0.13074480
## 795 Low 0.986490055 0.01350995
## 796 Low 0.979675106 0.02032489
## 797 Low 0.555989916 0.44401008
## 801 Low 0.775047694 0.22495231
## 811 Low 0.556377102 0.44362290
## 812 Low 0.929865865 0.07013413
## 815 Low 0.957749970 0.04225003
## 816 Low 0.758028058 0.24197194
## 817 Low 0.951236951 0.04876305
## 824 Low 0.886411056 0.11358894
## 825 Low 0.886411056 0.11358894
## 826 Low 0.886411056 0.11358894
## 830 Low 0.912117000 0.08788300
## 837 Low 0.940840521 0.05915948
## 838 Low 0.758028058 0.24197194
## 844 Low 0.927974648 0.07202535
## 845 Low 0.976894550 0.02310545
## 847 Low 0.933393677 0.06660632
## 850 Low 0.902116294 0.09788371
## 852 Low 0.915857107 0.08414289
## 853 Low 0.915857107 0.08414289
## 861 Low 0.927636110 0.07236389
## 868 Low 0.980340828 0.01965917
## 874 Low 0.937852871 0.06214713
## 879 High 0.113209880 0.88679012
## 895 High 0.014930232 0.98506977
## 899 High 0.001859651 0.99814035
## 903 High 0.014930232 0.98506977
## 917 High 0.056418013 0.94358199
## 927 High 0.027319430 0.97268057
## 929 High 0.116830681 0.88316932
## 931 High 0.027319430 0.97268057
## 933 High 0.362953760 0.63704624
## 944 High 0.043101463 0.95689854
## 947 High 0.046903055 0.95309695
## 949 High 0.184608445 0.81539155
## 953 High 0.021584412 0.97841559
## 958 High 0.536045815 0.46395419
## 961 High 0.015768127 0.98423187
## 963 High 0.119476478 0.88052352
## 964 High 0.076255775 0.92374422
## 973 High 0.049972348 0.95002765
## 976 High 0.037015067 0.96298493
## 977 High 0.178797120 0.82120288
## 980 High 0.275965540 0.72403446
## 983 High 0.650018484 0.34998152
## 984 High 0.178797120 0.82120288
## 986 High 0.101828758 0.89817124
## 989 High 0.202228735 0.79777127
## 991 High 0.023066118 0.97693388
## 996 High 0.050382284 0.94961772
## 997 High 0.470933547 0.52906645
## 999 High 0.044071687 0.95592831
## 1000 High 0.051069687 0.94893031
## 1003 High 0.041892008 0.95810799
## 1008 High 0.099376550 0.90062345
## 1009 High 0.443886349 0.55611365
## 1014 High 0.041232631 0.95876737
## 1015 High 0.470359921 0.52964008
## 1040 High 0.177580468 0.82241953
## 1042 High 0.339492531 0.66050747
## 1043 High 0.730538815 0.26946119
## 1050 High 0.104620962 0.89537904
## 1052 High 0.180674837 0.81932516
## 1056 High 0.333657886 0.66634211
## 1070 High 0.555232608 0.44476739
## 1073 High 0.486744863 0.51325514
## 1074 High 0.177106183 0.82289382
## 1079 High 0.416066054 0.58393395
## 1080 High 0.599211550 0.40078845
## 1085 High 0.082405863 0.91759414
## 1087 High 0.657183051 0.34281695
## 1096 High 0.909565322 0.09043468
## 1099 High 0.440418289 0.55958171
## 1100 High 0.699230090 0.30076991
## 1102 High 0.071892116 0.92810788
## 1107 Low 0.454403473 0.54559653
## 1109 Low 0.675224568 0.32477543
## 1114 Low 0.491991507 0.50800849
## 1118 Low 0.356946854 0.64305315
## 1123 Low 0.432009924 0.56799008
## 1132 Low 0.886272790 0.11372721
## 1134 Low 0.603826999 0.39617300
## 1137 Low 0.357226702 0.64277330
## 1154 Low 0.357226702 0.64277330
## 1155 Low 0.582463212 0.41753679
## 1157 Low 0.765117005 0.23488300
## 1162 Low 0.628905923 0.37109408
## 1164 Low 0.168057998 0.83194200
## 1171 Low 0.933147457 0.06685254
## 1172 Low 0.513142137 0.48685786
## 1175 Low 0.627703257 0.37229674
## 1177 Low 0.687471315 0.31252868
## 1179 Low 0.902338418 0.09766158
## 1183 Low 0.268533199 0.73146680
## 1185 Low 0.934234085 0.06576592
## 1189 Low 0.820187858 0.17981214
## 1211 Low 0.816488095 0.18351190
## 1218 Low 0.987709192 0.01229081
## 1224 Low 0.418627487 0.58137251
## 1225 Low 0.359811881 0.64018812
## 1227 Low 0.906940352 0.09305965
## 1232 Low 0.983937916 0.01606208
## 1235 Low 0.755569564 0.24443044
## 1238 Low 0.713703436 0.28629656
## 1240 Low 0.855212853 0.14478715
## 1241 Low 0.816488095 0.18351190
## 1248 Low 0.912117000 0.08788300
## 1258 Low 0.758028058 0.24197194
## 1261 Low 0.886758010 0.11324199
## 1263 Low 0.886411056 0.11358894
## 1269 Low 0.955630849 0.04436915
## 1270 Low 0.985258898 0.01474110
## 1271 Low 0.902116294 0.09788371
## 1272 Low 0.915857107 0.08414289
## 1280 Low 0.937852871 0.06214713
## 1286 Low 0.986020488 0.01397951
## 1287 Low 0.987076206 0.01292379
## 1289 Low 0.953597078 0.04640292
## 1290 Low 0.915857107 0.08414289
## 1291 High 0.175452898 0.82454710
## 1294 High 0.799908047 0.20009195
## 1305 Low 0.880239111 0.11976089
## 1308 High 0.727385856 0.27261414
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the test-set ROC curve from the observed classes and the predicted
# probability of the "High" class; the factor levels of the observed
# response are passed to roc() in reversed order.
LR_ET_Test_ROC <- roc(response = LR_ET_Test$LR_ET_Observed,
                      predictor = LR_ET_Test$LR_ET_Predicted.High,
                      levels = rev(levels(LR_ET_Test$LR_ET_Observed)))
# Store the AUC as a plain number; the outer parentheses also print it
(LR_ET_Test_ROCCurveAUC <- auc(LR_ET_Test_ROC)[1])
## [1] 0.890133
##################################
# Applying inverse hyperbolic sine function
##################################
# Start from fresh copies of the filtered train and test sets
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
# Apply the inverse hyperbolic sine, log(x + sqrt(x^2 + 1)), to every column
# except the last one, which the loop bounds leave untouched.
# seq_len() skips the loop entirely when there is no column to transform,
# whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(ncol(PMA_PreModelling_Train)-1)){
  PMA_PreModelling_Train[,i] <- log(PMA_PreModelling_Train[,i]+(((PMA_PreModelling_Train[,i])^2)+1)^(1/2))
}
# Same transformation for the test set
for (i in seq_len(ncol(PMA_PreModelling_Test)-1)){
  PMA_PreModelling_Test[,i] <- log(PMA_PreModelling_Test[,i]+(((PMA_PreModelling_Test[,i])^2)+1)^(1/2))
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Working copies of the transformed train and test sets for this model
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
# Numeric-only views of both sets. vapply() guarantees a logical column mask
# and drop = FALSE keeps the result a data frame even if only one numeric
# column is present.
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)), drop = FALSE]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)), drop = FALSE]
##################################
# Treating data skewness
# for the train set
##################################
# The predictors were already transformed above, so the train set is copied
# as-is under the model-specific name.
PMA_PreModelling_Train_LR_IHST <- PMA_PreModelling_Train_LR
# Re-attach the response column from the source data frame (same values as
# the copy already holds; kept to make the response explicit).
PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the train set, split by class
featurePlot(
  x = PMA_PreModelling_Train_LR_IHST[, sapply(PMA_PreModelling_Train_LR_IHST, is.numeric)],
  y = PMA_PreModelling_Train_LR_IHST[["Log_Solubility_Class"]],
  plot = "box",
  main = "LR_IHST Train Set : Numeric Predictor Distribution by Response Level",
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
# Density plots of the same predictors, with one legend column per class
featurePlot(
  x = PMA_PreModelling_Train_LR_IHST[, sapply(PMA_PreModelling_Train_LR_IHST, is.numeric)],
  y = PMA_PreModelling_Train_LR_IHST[["Log_Solubility_Class"]],
  plot = "density",
  main = "LR_IHST Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_IHST[["Log_Solubility_Class"]])),
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG so the fold assignment is reproducible
set.seed(12345678)
# Ten folds built on the response; returnTrain = TRUE yields the row indices
# of the training portion of each fold.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
                             k = 10,
                             returnTrain=TRUE)
# Cross-validation control that reuses those folds and requests class
# probabilities so that twoClassSummary can report ROC, Sens and Spec.
KFold_Control <- trainControl(method="cv",
                              index=KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG so the resampling run is reproducible
set.seed(12345678)
# Fit the logistic regression (caret method "glm") on all columns except the
# response, evaluated by ROC over the predefined folds.
LR_IHST_Tune <- train(x = PMA_PreModelling_Train_LR_IHST[,!names(PMA_PreModelling_Train_LR_IHST) %in% c("Log_Solubility_Class")],
                      y = PMA_PreModelling_Train_LR_IHST$Log_Solubility_Class,
                      method = "glm",
                      metric = "ROC",
                      trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the resampling summary of the fitted caret object
LR_IHST_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8710722 0.7374862 0.8073657
# Print the underlying final model (coefficients and deviance)
LR_IHST_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 10.721 1.364 1.208 -3.671
## NumCarbon
## -1.128
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 836 AIC: 846
# Print the table of resampled performance metrics
LR_IHST_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8710722 0.7374862 0.8073657 0.04019999 0.06659904 0.0606913
# Store the cross-validated ROC AUC; the outer parentheses also print it
(LR_IHST_Train_ROCCurveAUC <- LR_IHST_Tune$results$ROC)
## [1] 0.8710722
##################################
# Identifying and plotting the
# best model predictors
##################################
# Scaled variable importance of the fitted model
LR_IHST_VarImp <- varImp(LR_IHST_Tune, scale = TRUE)
# Plot the four top-ranked predictors
plot(LR_IHST_VarImp,
     top=4,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Logistic Regression",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Treating data skewness
# for the test set
##################################
# The predictors were already transformed above, so the test set is copied
# as-is under the model-specific name.
PMA_PreModelling_Test_LR_IHST <- PMA_PreModelling_Test_LR
# Re-attach the response column from the source data frame (same values as
# the copy already holds; kept to make the response explicit).
PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the test set, split by class
featurePlot(
  x = PMA_PreModelling_Test_LR_IHST[, sapply(PMA_PreModelling_Test_LR_IHST, is.numeric)],
  y = PMA_PreModelling_Test_LR_IHST[["Log_Solubility_Class"]],
  plot = "box",
  main = "LR_IHST Test Set : Numeric Predictor Distribution by Response Level",
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
# Density plots of the same predictors, with one legend column per class
featurePlot(
  x = PMA_PreModelling_Test_LR_IHST[, sapply(PMA_PreModelling_Test_LR_IHST, is.numeric)],
  y = PMA_PreModelling_Test_LR_IHST[["Log_Solubility_Class"]],
  plot = "density",
  main = "LR_IHST Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_IHST[["Log_Solubility_Class"]])),
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair the observed test-set classes with the predicted class probabilities.
# predict(..., type = "prob") is called on every column except the response;
# the printed table shows the resulting columns as LR_IHST_Predicted.Low and
# LR_IHST_Predicted.High.
LR_IHST_Test <- data.frame(LR_IHST_Observed = PMA_PreModelling_Test_LR_IHST$Log_Solubility_Class,
                           LR_IHST_Predicted = predict(LR_IHST_Tune,
                                                       PMA_PreModelling_Test_LR_IHST[,!names(PMA_PreModelling_Test_LR_IHST) %in% c("Log_Solubility_Class")],
                                                       type = "prob"))
# Print the observed-versus-predicted table
LR_IHST_Test
## LR_IHST_Observed LR_IHST_Predicted.Low LR_IHST_Predicted.High
## 20 High 0.0099300682 0.99006993
## 21 High 0.0070843515 0.99291565
## 23 High 0.0578716298 0.94212837
## 25 High 0.0182500257 0.98174997
## 28 High 0.0801383023 0.91986170
## 31 High 0.0150075039 0.98499250
## 32 High 0.0292220184 0.97077798
## 33 High 0.0524819225 0.94751808
## 34 High 0.0524819225 0.94751808
## 37 High 0.3152799614 0.68472004
## 38 High 0.3152799614 0.68472004
## 42 High 0.6136673812 0.38633262
## 49 High 0.1598231577 0.84017684
## 54 High 0.0356333544 0.96436665
## 55 High 0.0219177360 0.97808226
## 58 High 0.4549805179 0.54501948
## 60 High 0.1172175407 0.88278246
## 61 High 0.0524819225 0.94751808
## 65 High 0.0609565698 0.93904343
## 69 High 0.5374744295 0.46252557
## 73 High 0.0182159523 0.98178405
## 86 High 0.0487705078 0.95122949
## 90 High 0.0693209815 0.93067902
## 91 High 0.0121654002 0.98783460
## 93 High 0.0693209815 0.93067902
## 96 High 0.0121654002 0.98783460
## 98 High 0.0775264455 0.92247355
## 100 High 0.0904961248 0.90950388
## 104 High 0.8993420391 0.10065796
## 112 High 0.1883413325 0.81165867
## 115 High 0.8526100741 0.14738993
## 119 High 0.1405876713 0.85941233
## 128 High 0.1405876713 0.85941233
## 130 High 0.0236487683 0.97635123
## 139 High 0.0236487683 0.97635123
## 143 High 0.0520388523 0.94796115
## 145 High 0.0793526698 0.92064733
## 146 High 0.1405876713 0.85941233
## 149 High 0.2122457494 0.78775425
## 150 High 0.1009300825 0.89906992
## 152 High 0.0524819225 0.94751808
## 157 High 0.5269860724 0.47301393
## 161 High 0.3337627789 0.66623722
## 162 High 0.0062680045 0.99373200
## 166 High 0.5354668322 0.46453317
## 167 High 0.1260047503 0.87399525
## 173 High 0.0887542160 0.91124578
## 176 High 0.1405876713 0.85941233
## 182 High 0.0061870117 0.99381299
## 187 High 0.0576049610 0.94239504
## 190 High 0.0156622796 0.98433772
## 194 High 0.0105820641 0.98941794
## 195 High 0.2551363208 0.74486368
## 201 High 0.1185994482 0.88140055
## 207 High 0.1831137154 0.81688628
## 208 High 0.4561448837 0.54385512
## 215 High 0.0150075039 0.98499250
## 222 High 0.3377831981 0.66221680
## 224 High 0.2291389419 0.77086106
## 231 High 0.7081685175 0.29183148
## 236 High 0.1339363550 0.86606365
## 237 High 0.0201223973 0.97987760
## 240 High 0.1926954630 0.80730454
## 243 High 0.1185994482 0.88140055
## 248 High 0.2551363208 0.74486368
## 251 High 0.8071616550 0.19283835
## 256 High 0.4973558109 0.50264419
## 258 High 0.2393589007 0.76064110
## 262 High 0.4561448837 0.54385512
## 266 High 0.5304409692 0.46955903
## 272 High 0.5720104543 0.42798955
## 280 High 0.3631225228 0.63687748
## 283 High 0.4275080021 0.57249200
## 286 High 0.5141733831 0.48582662
## 287 High 0.2435576257 0.75644237
## 289 High 0.0848641421 0.91513586
## 290 High 0.3819373209 0.61806268
## 298 High 0.3093894899 0.69061051
## 305 High 0.3858465603 0.61415344
## 306 High 0.2315157545 0.76848425
## 312 High 0.1353064070 0.86469359
## 320 High 0.2745723275 0.72542767
## 325 High 0.2358766959 0.76412330
## 332 High 0.0836961949 0.91630381
## 333 High 0.4080892379 0.59191076
## 335 High 0.3093894899 0.69061051
## 339 High 0.7128384446 0.28716156
## 346 High 0.4472904099 0.55270959
## 347 High 0.0300015916 0.96999841
## 350 High 0.3449964094 0.65500359
## 353 High 0.4462698817 0.55373012
## 358 High 0.4225671930 0.57743281
## 365 High 0.3039458499 0.69605415
## 367 High 0.2495985988 0.75040140
## 370 High 0.0427492613 0.95725074
## 379 High 0.1399298135 0.86007019
## 386 High 0.2689183845 0.73108162
## 394 High 0.6069330289 0.39306697
## 396 High 0.1774691742 0.82253083
## 400 High 0.0201223973 0.97987760
## 404 High 0.0221953778 0.97780462
## 405 High 0.5922002466 0.40779975
## 413 High 0.1989446496 0.80105535
## 415 High 0.4673768642 0.53262314
## 417 High 0.1389459796 0.86105402
## 418 High 0.5397594210 0.46024058
## 423 High 0.2609195742 0.73908043
## 434 High 0.3784020245 0.62159798
## 437 High 0.2626702786 0.73732972
## 440 High 0.3698931740 0.63010683
## 449 High 0.4326697948 0.56733021
## 450 High 0.3544261568 0.64557384
## 457 High 0.4326697948 0.56733021
## 467 High 0.3153110496 0.68468895
## 469 High 0.1527762476 0.84722375
## 474 High 0.9075381438 0.09246186
## 475 High 0.8765506823 0.12344932
## 485 High 0.1579281927 0.84207181
## 504 Low 0.2444521260 0.75554787
## 511 Low 0.5871849577 0.41281504
## 512 Low 0.5018320598 0.49816794
## 517 Low 0.0668902556 0.93310974
## 519 Low 0.7251116360 0.27488836
## 520 Low 0.0482164150 0.95178359
## 522 Low 0.9083535976 0.09164640
## 527 Low 0.6995000622 0.30049994
## 528 Low 0.2166802349 0.78331977
## 529 Low 0.3803802257 0.61961977
## 537 Low 0.0944886922 0.90551131
## 540 Low 0.8937605539 0.10623945
## 541 Low 0.4523082593 0.54769174
## 547 Low 0.8943000868 0.10569991
## 550 Low 0.5168207177 0.48317928
## 555 Low 0.5751405754 0.42485942
## 564 Low 0.0210364403 0.97896356
## 570 Low 0.4411424281 0.55885757
## 573 Low 0.3206488297 0.67935117
## 575 Low 0.4523082593 0.54769174
## 578 Low 0.1228104528 0.87718955
## 581 Low 0.3206488297 0.67935117
## 585 Low 0.4095958799 0.59040412
## 590 Low 0.7649838590 0.23501614
## 601 Low 0.7912963243 0.20870368
## 602 Low 0.6719001646 0.32809984
## 607 Low 0.6636629369 0.33633706
## 610 Low 0.6521049110 0.34789509
## 618 Low 0.7096458155 0.29035418
## 624 Low 0.4095958799 0.59040412
## 626 Low 0.3435856801 0.65641432
## 627 Low 0.3417805970 0.65821940
## 634 Low 0.5277070848 0.47229292
## 640 Low 0.9701320053 0.02986799
## 642 Low 0.1097637588 0.89023624
## 643 Low 0.5390069187 0.46099308
## 644 Low 0.8335923318 0.16640767
## 645 Low 0.7520869618 0.24791304
## 646 Low 0.6851967239 0.31480328
## 647 Low 0.6972416523 0.30275835
## 652 Low 0.1601891216 0.83981088
## 658 Low 0.6323549805 0.36764502
## 659 Low 0.7784373668 0.22156263
## 660 Low 0.8882344180 0.11176558
## 664 Low 0.3808059125 0.61919409
## 666 Low 0.4848211779 0.51517882
## 667 Low 0.8424237841 0.15757622
## 675 Low 0.5390069187 0.46099308
## 680 Low 0.9646714157 0.03532858
## 681 Low 0.8853530190 0.11464698
## 687 Low 0.7979014677 0.20209853
## 694 Low 0.7863428810 0.21365712
## 697 Low 0.5780038485 0.42199615
## 701 Low 0.2845925670 0.71540743
## 705 Low 0.9200100288 0.07998997
## 707 Low 0.7455518887 0.25444811
## 710 Low 0.6831664789 0.31683352
## 716 Low 0.8903703029 0.10962970
## 719 Low 0.9144792544 0.08552075
## 720 Low 0.9537538416 0.04624616
## 725 Low 0.9577964846 0.04220352
## 727 Low 0.2845925670 0.71540743
## 730 Low 0.5180529588 0.48194704
## 738 Low 0.8051588342 0.19484117
## 745 Low 0.6164473390 0.38355266
## 748 Low 0.7636704477 0.23632955
## 751 Low 0.9320117380 0.06798826
## 756 Low 0.7357943834 0.26420562
## 766 Low 0.7940349873 0.20596501
## 769 Low 0.6140644801 0.38593552
## 783 Low 0.8278040363 0.17219596
## 785 Low 0.8433731544 0.15662685
## 790 Low 0.8939502894 0.10604971
## 793 Low 0.8278040363 0.17219596
## 795 Low 0.9757396053 0.02426039
## 796 Low 0.9631790196 0.03682098
## 797 Low 0.6283445387 0.37165546
## 801 Low 0.7663863103 0.23361369
## 811 Low 0.5728956904 0.42710431
## 812 Low 0.9288809270 0.07111907
## 815 Low 0.9245343078 0.07546569
## 816 Low 0.7513710061 0.24862899
## 817 Low 0.9110816646 0.08891834
## 824 Low 0.8539043619 0.14609564
## 825 Low 0.8539043619 0.14609564
## 826 Low 0.8539043619 0.14609564
## 830 Low 0.8557141199 0.14428588
## 837 Low 0.8960597684 0.10394023
## 838 Low 0.7513710061 0.24862899
## 844 Low 0.8776431746 0.12235683
## 845 Low 0.9529465696 0.04705343
## 847 Low 0.9085970121 0.09140299
## 850 Low 0.8755465620 0.12445344
## 852 Low 0.8934471700 0.10655283
## 853 Low 0.8934471700 0.10655283
## 861 Low 0.9082923526 0.09170765
## 868 Low 0.9589272001 0.04107280
## 874 Low 0.9207968258 0.07920317
## 879 High 0.0946843934 0.90531561
## 895 High 0.0154204755 0.98457952
## 899 High 0.0001948401 0.99980516
## 903 High 0.0154204755 0.98457952
## 917 High 0.0596706125 0.94032939
## 927 High 0.0356333544 0.96436665
## 929 High 0.0887542160 0.91124578
## 931 High 0.0356333544 0.96436665
## 933 High 0.4369472072 0.56305279
## 944 High 0.0614810203 0.93851898
## 947 High 0.0693209815 0.93067902
## 949 High 0.2464938138 0.75350619
## 953 High 0.0184962609 0.98150374
## 958 High 0.4853511053 0.51464889
## 961 High 0.0213531141 0.97864689
## 963 High 0.1129852773 0.88701472
## 964 High 0.1185994482 0.88140055
## 973 High 0.0653591674 0.93464083
## 976 High 0.0210364403 0.97896356
## 977 High 0.2551363208 0.74486368
## 980 High 0.2741358706 0.72586413
## 983 High 0.6342816182 0.36571838
## 984 High 0.2551363208 0.74486368
## 986 High 0.1353064070 0.86469359
## 989 High 0.1686883204 0.83131168
## 991 High 0.0281518759 0.97184812
## 996 High 0.0254700687 0.97452993
## 997 High 0.4490879688 0.55091203
## 999 High 0.0578716298 0.94212837
## 1000 High 0.0600594201 0.93994058
## 1003 High 0.0201223973 0.97987760
## 1008 High 0.1412171683 0.85878283
## 1009 High 0.4275080021 0.57249200
## 1014 High 0.0667172124 0.93328279
## 1015 High 0.5346752396 0.46532476
## 1040 High 0.2265614990 0.77343850
## 1042 High 0.4024945490 0.59750545
## 1043 High 0.7138824116 0.28611759
## 1050 High 0.1370627240 0.86293728
## 1052 High 0.2538067860 0.74619321
## 1056 High 0.1534757420 0.84652426
## 1070 High 0.6331959179 0.36680408
## 1073 High 0.5012647881 0.49873521
## 1074 High 0.1370051566 0.86299484
## 1079 High 0.4040490277 0.59595097
## 1080 High 0.5480798185 0.45192018
## 1085 High 0.0730303379 0.92696966
## 1087 High 0.7155593169 0.28444068
## 1096 High 0.9127890547 0.08721095
## 1099 High 0.5132163700 0.48678363
## 1100 High 0.6607261272 0.33927387
## 1102 High 0.0746000948 0.92539991
## 1107 Low 0.3601428127 0.63985719
## 1109 Low 0.7166172165 0.28338278
## 1114 Low 0.3934733313 0.60652667
## 1118 Low 0.4477738160 0.55222618
## 1123 Low 0.4773018833 0.52269812
## 1132 Low 0.8357851715 0.16421483
## 1134 Low 0.6748232479 0.32517675
## 1137 Low 0.3417805970 0.65821940
## 1154 Low 0.3417805970 0.65821940
## 1155 Low 0.6580481264 0.34195187
## 1157 Low 0.7989572984 0.20104270
## 1162 Low 0.5390069187 0.46099308
## 1164 Low 0.1097637588 0.89023624
## 1171 Low 0.9320117380 0.06798826
## 1172 Low 0.5269860724 0.47301393
## 1175 Low 0.6872275852 0.31277241
## 1177 Low 0.6517991969 0.34820080
## 1179 Low 0.9066712497 0.09332875
## 1183 Low 0.1968901528 0.80310985
## 1185 Low 0.9353748426 0.06462516
## 1189 Low 0.8149583318 0.18504167
## 1211 Low 0.7381952599 0.26180474
## 1218 Low 0.9802534524 0.01974655
## 1224 Low 0.4232012115 0.57679879
## 1225 Low 0.2845925670 0.71540743
## 1227 Low 0.9120709631 0.08792904
## 1232 Low 0.9714750094 0.02852499
## 1235 Low 0.7910733456 0.20892665
## 1238 Low 0.7635803175 0.23641968
## 1240 Low 0.8695451356 0.13045486
## 1241 Low 0.7381952599 0.26180474
## 1248 Low 0.8557141199 0.14428588
## 1258 Low 0.7513710061 0.24862899
## 1261 Low 0.8706811127 0.12931889
## 1263 Low 0.8539043619 0.14609564
## 1269 Low 0.9367083202 0.06329168
## 1270 Low 0.9728605279 0.02713947
## 1271 Low 0.8755465620 0.12445344
## 1272 Low 0.8934471700 0.10655283
## 1280 Low 0.9207968258 0.07920317
## 1286 Low 0.9697385070 0.03026149
## 1287 Low 0.9716992194 0.02830078
## 1289 Low 0.9400342381 0.05996576
## 1290 Low 0.8934471700 0.10655283
## 1291 High 0.2366679120 0.76333209
## 1294 High 0.7680428374 0.23195716
## 1305 Low 0.8943028215 0.10569718
## 1308 High 0.7223131307 0.27768687
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the test-set ROC curve from the observed classes and the predicted
# probability of the "High" class; the factor levels of the observed
# response are passed to roc() in reversed order.
LR_IHST_Test_ROC <- roc(response = LR_IHST_Test$LR_IHST_Observed,
                        predictor = LR_IHST_Test$LR_IHST_Predicted.High,
                        levels = rev(levels(LR_IHST_Test$LR_IHST_Observed)))
# Store the AUC as a plain number; the outer parentheses also print it
(LR_IHST_Test_ROCCurveAUC <- auc(LR_IHST_Test_ROC)[1])
## [1] 0.8799871
##################################
# Applying numerical adjustments
# to eliminate zero values and
# base10-logarithm function
##################################
# Start from fresh copies of the filtered train and test sets
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test
# For every column except the last one: shift the values by 1 and then take
# the base-10 logarithm. Any value at or below -1 before the shift would
# give -Inf or NaN from log10().
# seq_len() skips the loop entirely when there is no column to transform,
# whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(ncol(PMA_PreModelling_Train)-1)){
  PMA_PreModelling_Train[,i] <- PMA_PreModelling_Train[,i] + 1
  PMA_PreModelling_Train[,i] <- log10(PMA_PreModelling_Train[,i])
}
# Same transformation for the test set
for (i in seq_len(ncol(PMA_PreModelling_Test)-1)){
  PMA_PreModelling_Test[,i] <- PMA_PreModelling_Test[,i] + 1
  PMA_PreModelling_Test[,i] <- log10(PMA_PreModelling_Test[,i])
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Working copies of the transformed train and test sets for this model
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test
# Numeric-only views of both sets. vapply() guarantees a logical column mask
# and drop = FALSE keeps the result a data frame even if only one numeric
# column is present.
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[,vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)), drop = FALSE]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[,vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)), drop = FALSE]
##################################
# Treating data skewness
# for the train set
##################################
# The predictors were already transformed above, so the train set is copied
# as-is under the model-specific name.
PMA_PreModelling_Train_LR_LOG10T <- PMA_PreModelling_Train_LR
# Re-attach the response column from the source data frame (same values as
# the copy already holds; kept to make the response explicit).
PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the train set, split by class
featurePlot(
  x = PMA_PreModelling_Train_LR_LOG10T[, sapply(PMA_PreModelling_Train_LR_LOG10T, is.numeric)],
  y = PMA_PreModelling_Train_LR_LOG10T[["Log_Solubility_Class"]],
  plot = "box",
  main = "LR_LOG10T Train Set : Numeric Predictor Distribution by Response Level",
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
# Density plots of the same predictors, with one legend column per class
featurePlot(
  x = PMA_PreModelling_Train_LR_LOG10T[, sapply(PMA_PreModelling_Train_LR_LOG10T, is.numeric)],
  y = PMA_PreModelling_Train_LR_LOG10T[["Log_Solubility_Class"]],
  plot = "density",
  main = "LR_LOG10T Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_LOG10T[["Log_Solubility_Class"]])),
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG so the fold assignment is reproducible
set.seed(12345678)
# Ten folds built on the response; returnTrain = TRUE yields the row indices
# of the training portion of each fold.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
                             k = 10,
                             returnTrain=TRUE)
# Cross-validation control that reuses those folds and requests class
# probabilities so that twoClassSummary can report ROC, Sens and Spec.
KFold_Control <- trainControl(method="cv",
                              index=KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG so the resampling run is reproducible
set.seed(12345678)
# Fit the logistic regression (caret method "glm") on all columns except the
# response, evaluated by ROC over the predefined folds.
LR_LOG10T_Tune <- train(x = PMA_PreModelling_Train_LR_LOG10T[,!names(PMA_PreModelling_Train_LR_LOG10T) %in% c("Log_Solubility_Class")],
                        y = PMA_PreModelling_Train_LR_LOG10T$Log_Solubility_Class,
                        method = "glm",
                        metric = "ROC",
                        trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the resampling summary of the fitted caret object
LR_LOG10T_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.891921 0.7492802 0.8245646
# Print the underlying final model (coefficients and deviance)
LR_LOG10T_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 11.417 2.928 3.279 -14.187
## NumCarbon
## 0.934
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 767.4 AIC: 777.4
# Print the table of resampled performance metrics
LR_LOG10T_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.891921 0.7492802 0.8245646 0.03393011 0.06711081 0.06719134
# Store the cross-validated ROC AUC; the outer parentheses also print it
(LR_LOG10T_Train_ROCCurveAUC <- LR_LOG10T_Tune$results$ROC)
## [1] 0.891921
##################################
# Identifying and plotting the
# best model predictors
##################################
# Scaled variable importance of the fitted model
LR_LOG10T_VarImp <- varImp(LR_LOG10T_Tune, scale = TRUE)
# Plot the four top-ranked predictors
plot(LR_LOG10T_VarImp,
     top=4,
     scales=list(y=list(cex = .95)),
     main="Ranked Variable Importance : Logistic Regression",
     xlab="Scaled Variable Importance Metrics",
     ylab="Predictors",
     cex=2,
     origin=0,
     alpha=0.45)
##################################
# Treating data skewness
# for the test set
##################################
# The predictors were already transformed above, so the test set is copied
# as-is under the model-specific name.
PMA_PreModelling_Test_LR_LOG10T <- PMA_PreModelling_Test_LR
# Re-attach the response column from the source data frame (same values as
# the copy already holds; kept to make the response explicit).
PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the test set, split by class
featurePlot(
  x = PMA_PreModelling_Test_LR_LOG10T[, sapply(PMA_PreModelling_Test_LR_LOG10T, is.numeric)],
  y = PMA_PreModelling_Test_LR_LOG10T[["Log_Solubility_Class"]],
  plot = "box",
  main = "LR_LOG10T Test Set : Numeric Predictor Distribution by Response Level",
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
# Density plots of the same predictors, with one legend column per class
featurePlot(
  x = PMA_PreModelling_Test_LR_LOG10T[, sapply(PMA_PreModelling_Test_LR_LOG10T, is.numeric)],
  y = PMA_PreModelling_Test_LR_LOG10T[["Log_Solubility_Class"]],
  plot = "density",
  main = "LR_LOG10T Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_LOG10T[["Log_Solubility_Class"]])),
  pch = "|",
  adjust = 1.5,
  scales = list(y = list(relation = "free"),
                x = list(rot = 90, relation = "free"))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair the observed test-set classes with the predicted class probabilities.
# predict(..., type = "prob") is called on every column except the response;
# the printed table shows the resulting columns as LR_LOG10T_Predicted.Low
# and LR_LOG10T_Predicted.High.
LR_LOG10T_Test <- data.frame(LR_LOG10T_Observed = PMA_PreModelling_Test_LR_LOG10T$Log_Solubility_Class,
                             LR_LOG10T_Predicted = predict(LR_LOG10T_Tune,
                                                           PMA_PreModelling_Test_LR_LOG10T[,!names(PMA_PreModelling_Test_LR_LOG10T) %in% c("Log_Solubility_Class")],
                                                           type = "prob"))
# Print the observed-versus-predicted table
LR_LOG10T_Test
## LR_LOG10T_Observed LR_LOG10T_Predicted.Low LR_LOG10T_Predicted.High
## 20 High 0.0114244517 0.9885755483
## 21 High 0.0080310909 0.9919689091
## 23 High 0.0354283423 0.9645716577
## 25 High 0.0113319188 0.9886680812
## 28 High 0.0603784377 0.9396215623
## 31 High 0.0126421937 0.9873578063
## 32 High 0.0204088121 0.9795911879
## 33 High 0.0401610347 0.9598389653
## 34 High 0.0401610347 0.9598389653
## 37 High 0.3543519894 0.6456480106
## 38 High 0.3543519894 0.6456480106
## 42 High 0.4516439293 0.5483560707
## 49 High 0.1326709535 0.8673290465
## 54 High 0.0127138418 0.9872861582
## 55 High 0.0171425230 0.9828574770
## 58 High 0.4508886389 0.5491113611
## 60 High 0.0458725899 0.9541274101
## 61 High 0.0401610347 0.9598389653
## 65 High 0.0422682755 0.9577317245
## 69 High 0.5360931241 0.4639068759
## 73 High 0.0313344498 0.9686655502
## 86 High 0.0379168490 0.9620831510
## 90 High 0.0246356037 0.9753643963
## 91 High 0.0112519351 0.9887480649
## 93 High 0.0246356037 0.9753643963
## 96 High 0.0112519351 0.9887480649
## 98 High 0.0281163829 0.9718836171
## 100 High 0.0455991395 0.9544008605
## 104 High 0.8596843413 0.1403156587
## 112 High 0.1899203318 0.8100796682
## 115 High 0.8830092589 0.1169907411
## 119 High 0.1145296081 0.8854703919
## 128 High 0.1145296081 0.8854703919
## 130 High 0.0186594977 0.9813405023
## 139 High 0.0186594977 0.9813405023
## 143 High 0.0369097104 0.9630902896
## 145 High 0.0632876797 0.9367123203
## 146 High 0.1145296081 0.8854703919
## 149 High 0.1330597875 0.8669402125
## 150 High 0.0881581152 0.9118418848
## 152 High 0.0401610347 0.9598389653
## 157 High 0.5491594648 0.4508405352
## 161 High 0.1818965999 0.8181034001
## 162 High 0.0114108307 0.9885891693
## 166 High 0.5306213331 0.4693786669
## 167 High 0.1130629002 0.8869370998
## 173 High 0.0762280843 0.9237719157
## 176 High 0.1145296081 0.8854703919
## 182 High 0.0040896523 0.9959103477
## 187 High 0.0411811702 0.9588188298
## 190 High 0.0232842897 0.9767157103
## 194 High 0.0175444271 0.9824555729
## 195 High 0.1108113031 0.8891886969
## 201 High 0.0436230271 0.9563769729
## 207 High 0.0716468222 0.9283531778
## 208 High 0.5395740091 0.4604259909
## 215 High 0.0126421937 0.9873578063
## 222 High 0.1592460118 0.8407539882
## 224 High 0.2064348880 0.7935651120
## 231 High 0.7632446817 0.2367553183
## 236 High 0.0866704378 0.9133295622
## 237 High 0.0136419825 0.9863580175
## 240 High 0.2146524994 0.7853475006
## 243 High 0.0436230271 0.9563769729
## 248 High 0.1108113031 0.8891886969
## 251 High 0.6961143000 0.3038857000
## 256 High 0.3070340842 0.6929659158
## 258 High 0.1230410001 0.8769589999
## 262 High 0.5395740091 0.4604259909
## 266 High 0.4571965619 0.5428034381
## 272 High 0.5720886850 0.4279113150
## 280 High 0.3082654815 0.6917345185
## 283 High 0.4354389857 0.5645610143
## 286 High 0.4391327329 0.5608672671
## 287 High 0.1041477820 0.8958522180
## 289 High 0.1437887272 0.8562112728
## 290 High 0.3282018426 0.6717981574
## 298 High 0.2939848627 0.7060151373
## 305 High 0.2492525888 0.7507474112
## 306 High 0.2398758877 0.7601241123
## 312 High 0.0973381705 0.9026618295
## 320 High 0.3058707588 0.6941292412
## 325 High 0.1771522624 0.8228477376
## 332 High 0.0556061812 0.9443938188
## 333 High 0.4847787185 0.5152212815
## 335 High 0.2939848627 0.7060151373
## 339 High 0.8371427968 0.1628572032
## 346 High 0.2382954634 0.7617045366
## 347 High 0.0327311005 0.9672688995
## 350 High 0.4097345492 0.5902654508
## 353 High 0.2663701600 0.7336298400
## 358 High 0.2182251914 0.7817748086
## 365 High 0.1922987093 0.8077012907
## 367 High 0.2058056954 0.7941943046
## 370 High 0.0651543422 0.9348456578
## 379 High 0.1327838973 0.8672161027
## 386 High 0.5247916818 0.4752083182
## 394 High 0.4697661230 0.5302338770
## 396 High 0.1215635914 0.8784364086
## 400 High 0.0136419825 0.9863580175
## 404 High 0.0493865148 0.9506134852
## 405 High 0.6323780570 0.3676219430
## 413 High 0.0931860048 0.9068139952
## 415 High 0.3133445051 0.6866554949
## 417 High 0.2643581720 0.7356418280
## 418 High 0.3679591512 0.6320408488
## 423 High 0.2714888903 0.7285111097
## 434 High 0.1877329621 0.8122670379
## 437 High 0.3396836347 0.6603163653
## 440 High 0.4302378412 0.5697621588
## 449 High 0.3642274184 0.6357725816
## 450 High 0.2638891235 0.7361108765
## 457 High 0.3642274184 0.6357725816
## 467 High 0.3843344069 0.6156655931
## 469 High 0.3427771468 0.6572228532
## 474 High 0.9448977678 0.0551022322
## 475 High 0.9069691143 0.0930308857
## 485 High 0.1206262001 0.8793737999
## 504 Low 0.2810994647 0.7189005353
## 511 Low 0.7713879404 0.2286120596
## 512 Low 0.3771323094 0.6228676906
## 517 Low 0.0796952208 0.9203047792
## 519 Low 0.6631915667 0.3368084333
## 520 Low 0.0365399927 0.9634600073
## 522 Low 0.9567591567 0.0432408433
## 527 Low 0.5472028728 0.4527971272
## 528 Low 0.4455816601 0.5544183399
## 529 Low 0.2855557521 0.7144442479
## 537 Low 0.0818301918 0.9181698082
## 540 Low 0.9324385799 0.0675614201
## 541 Low 0.7941384873 0.2058615127
## 547 Low 0.9225916586 0.0774083414
## 550 Low 0.7281742163 0.2718257837
## 555 Low 0.4391059978 0.5608940022
## 564 Low 0.0179846866 0.9820153134
## 570 Low 0.3080507810 0.6919492190
## 573 Low 0.2060658806 0.7939341194
## 575 Low 0.7941384873 0.2058615127
## 578 Low 0.2347216890 0.7652783110
## 581 Low 0.2060658806 0.7939341194
## 585 Low 0.3173015329 0.6826984671
## 590 Low 0.6875129427 0.3124870573
## 601 Low 0.7826540228 0.2173459772
## 602 Low 0.6143605885 0.3856394115
## 607 Low 0.4573521726 0.5426478274
## 610 Low 0.4429636059 0.5570363941
## 618 Low 0.7158484958 0.2841515042
## 624 Low 0.3173015329 0.6826984671
## 626 Low 0.1600193605 0.8399806395
## 627 Low 0.3215503018 0.6784496982
## 634 Low 0.6883858742 0.3116141258
## 640 Low 0.9844093361 0.0155906639
## 642 Low 0.2100918801 0.7899081199
## 643 Low 0.8692472243 0.1307527757
## 644 Low 0.8718338373 0.1281661627
## 645 Low 0.7731505226 0.2268494774
## 646 Low 0.9397436656 0.0602563344
## 647 Low 0.7401939612 0.2598060388
## 652 Low 0.1584162203 0.8415837797
## 658 Low 0.5345180440 0.4654819560
## 659 Low 0.9685770972 0.0314229028
## 660 Low 0.8148166346 0.1851833654
## 664 Low 0.3515609710 0.6484390290
## 666 Low 0.4641158585 0.5358841415
## 667 Low 0.8356306242 0.1643693758
## 675 Low 0.8692472243 0.1307527757
## 680 Low 0.9606395556 0.0393604444
## 681 Low 0.8406617830 0.1593382170
## 687 Low 0.9754003752 0.0245996248
## 694 Low 0.6710783028 0.3289216972
## 697 Low 0.5964862294 0.4035137706
## 701 Low 0.5884750688 0.4115249312
## 705 Low 0.8954203536 0.1045796464
## 707 Low 0.5851454633 0.4148545367
## 710 Low 0.8010389126 0.1989610874
## 716 Low 0.8184036632 0.1815963368
## 719 Low 0.9428266773 0.0571733227
## 720 Low 0.9847960378 0.0152039622
## 725 Low 0.9818803020 0.0181196980
## 727 Low 0.5884750688 0.4115249312
## 730 Low 0.5611518183 0.4388481817
## 738 Low 0.9081879879 0.0918120121
## 745 Low 0.9170977223 0.0829022777
## 748 Low 0.8391627813 0.1608372187
## 751 Low 0.8963099718 0.1036900282
## 756 Low 0.6480019501 0.3519980499
## 766 Low 0.6647629476 0.3352370524
## 769 Low 0.6639046057 0.3360953943
## 783 Low 0.8863780609 0.1136219391
## 785 Low 0.9847467994 0.0152532006
## 790 Low 0.8248336297 0.1751663703
## 793 Low 0.8863780609 0.1136219391
## 795 Low 0.9897095855 0.0102904145
## 796 Low 0.9850516420 0.0149483580
## 797 Low 0.4811591379 0.5188408621
## 801 Low 0.7986667815 0.2013332185
## 811 Low 0.5989448209 0.4010551791
## 812 Low 0.8865111374 0.1134888626
## 815 Low 0.9820707051 0.0179292949
## 816 Low 0.8050772583 0.1949227417
## 817 Low 0.9939985174 0.0060014826
## 824 Low 0.8901108481 0.1098891519
## 825 Low 0.8901108481 0.1098891519
## 826 Low 0.8901108481 0.1098891519
## 830 Low 0.9863085584 0.0136914416
## 837 Low 0.9927031149 0.0072968851
## 838 Low 0.8050772583 0.1949227417
## 844 Low 0.9899820408 0.0100179592
## 845 Low 0.9980207841 0.0019792159
## 847 Low 0.9347533178 0.0652466822
## 850 Low 0.9018688685 0.0981311315
## 852 Low 0.9155921637 0.0844078363
## 853 Low 0.9155921637 0.0844078363
## 861 Low 0.9288460568 0.0711539432
## 868 Low 0.9983178675 0.0016821325
## 874 Low 0.9409936465 0.0590063535
## 879 High 0.0743114149 0.9256885851
## 895 High 0.0058470295 0.9941529705
## 899 High 0.0007541913 0.9992458087
## 903 High 0.0058470295 0.9941529705
## 917 High 0.0541474152 0.9458525848
## 927 High 0.0127138418 0.9872861582
## 929 High 0.0762280843 0.9237719157
## 931 High 0.0127138418 0.9872861582
## 933 High 0.3431573442 0.6568426558
## 944 High 0.0283606163 0.9716393837
## 947 High 0.0246356037 0.9753643963
## 949 High 0.1676976167 0.8323023833
## 953 High 0.0325221723 0.9674778277
## 958 High 0.5930878020 0.4069121980
## 961 High 0.0095523084 0.9904476916
## 963 High 0.0969115592 0.9030884408
## 964 High 0.0436230271 0.9563769729
## 973 High 0.0478986779 0.9521013221
## 976 High 0.0179846866 0.9820153134
## 977 High 0.1108113031 0.8891886969
## 980 High 0.3323177208 0.6676822792
## 983 High 0.6441963861 0.3558036139
## 984 High 0.1108113031 0.8891886969
## 986 High 0.0973381705 0.9026618295
## 989 High 0.1679011559 0.8320988441
## 991 High 0.0218893139 0.9781106861
## 996 High 0.0176152778 0.9823847222
## 997 High 0.4484937124 0.5515062876
## 999 High 0.0354283423 0.9645716577
## 1000 High 0.0598930376 0.9401069624
## 1003 High 0.0136419825 0.9863580175
## 1008 High 0.0805914802 0.9194085198
## 1009 High 0.4354389857 0.5645610143
## 1014 High 0.0282625766 0.9717374234
## 1015 High 0.4604541718 0.5395458282
## 1040 High 0.1834826180 0.8165173820
## 1042 High 0.3502196618 0.6497803382
## 1043 High 0.7301353926 0.2698646074
## 1050 High 0.1128584616 0.8871415384
## 1052 High 0.1326642328 0.8673357672
## 1056 High 0.4214253198 0.5785746802
## 1070 High 0.5016341979 0.4983658021
## 1073 High 0.5016785964 0.4983214036
## 1074 High 0.1240863502 0.8759136498
## 1079 High 0.4087109291 0.5912890709
## 1080 High 0.6775015397 0.3224984603
## 1085 High 0.1017757206 0.8982242794
## 1087 High 0.5255391386 0.4744608614
## 1096 High 0.8643165720 0.1356834280
## 1099 High 0.4099125986 0.5900874014
## 1100 High 0.7154999827 0.2845000173
## 1102 High 0.0887503601 0.9112496399
## 1107 Low 0.6788519384 0.3211480616
## 1109 Low 0.6292550701 0.3707449299
## 1114 Low 0.7123656715 0.2876343285
## 1118 Low 0.2889459081 0.7110540919
## 1123 Low 0.4860246521 0.5139753479
## 1132 Low 0.9313431964 0.0686568036
## 1134 Low 0.5204468329 0.4795531671
## 1137 Low 0.3215503018 0.6784496982
## 1154 Low 0.3215503018 0.6784496982
## 1155 Low 0.4988737937 0.5011262063
## 1157 Low 0.6870639639 0.3129360361
## 1162 Low 0.8692472243 0.1307527757
## 1164 Low 0.2100918801 0.7899081199
## 1171 Low 0.8963099718 0.1036900282
## 1172 Low 0.5491594648 0.4508405352
## 1175 Low 0.5561457281 0.4438542719
## 1177 Low 0.7164966875 0.2835033125
## 1179 Low 0.9138172674 0.0861827326
## 1183 Low 0.4123464196 0.5876535804
## 1185 Low 0.9073388166 0.0926611834
## 1189 Low 0.7755333295 0.2244666705
## 1211 Low 0.9600742008 0.0399257992
## 1218 Low 0.9898292244 0.0101707756
## 1224 Low 0.4045598255 0.5954401745
## 1225 Low 0.5884750688 0.4115249312
## 1227 Low 0.8791559614 0.1208440386
## 1232 Low 0.9872441389 0.0127558611
## 1235 Low 0.6380743568 0.3619256432
## 1238 Low 0.6179361327 0.3820638673
## 1240 Low 0.8199182266 0.1800817734
## 1241 Low 0.9600742008 0.0399257992
## 1248 Low 0.9863085584 0.0136914416
## 1258 Low 0.8050772583 0.1949227417
## 1261 Low 0.9027349430 0.0972650570
## 1263 Low 0.8901108481 0.1098891519
## 1269 Low 0.9568830828 0.0431169172
## 1270 Low 0.9882558677 0.0117441323
## 1271 Low 0.9018688685 0.0981311315
## 1272 Low 0.9155921637 0.0844078363
## 1280 Low 0.9409936465 0.0590063535
## 1286 Low 0.9990096848 0.0009903152
## 1287 Low 0.9990849676 0.0009150324
## 1289 Low 0.9599001175 0.0400998825
## 1290 Low 0.9155921637 0.0844078363
## 1291 High 0.1594073766 0.8405926234
## 1294 High 0.8044109182 0.1955890818
## 1305 Low 0.8477696040 0.1522303960
## 1308 High 0.6914165831 0.3085834169
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the ROC curve for the LOG10T model on the test set: the observed
# class is the response and the predicted probability of the "High" class
# is the score. The observed factor's levels are passed in reverse order.
LR_LOG10T_Test_ROC <- roc(response = LR_LOG10T_Test$LR_LOG10T_Observed,
                          predictor = LR_LOG10T_Test$LR_LOG10T_Predicted.High,
                          levels = rev(levels(LR_LOG10T_Test$LR_LOG10T_Observed)))

# Store the test-set AUC as a plain number; the enclosing parentheses
# also print the assigned value.
(LR_LOG10T_Test_ROCCurveAUC <- auc(LR_LOG10T_Test_ROC)[1])
## [1] 0.8988237
##################################
# Applying a numerical adjustment
# (adding 1) to eliminate zero values,
# followed by the natural logarithm function
##################################
# Start from fresh copies of the filtered train and test sets.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

# Transform every column except the last one (the Log_Solubility_Class
# response) as log(x + 1). seq_len() is used instead of 1:(n - 1) so the
# loop body is skipped rather than run backwards if only one column exists.
for (i in seq_len(ncol(PMA_PreModelling_Train) - 1)) {
  PMA_PreModelling_Train[, i] <- log(PMA_PreModelling_Train[, i] + 1)
}

# Apply the identical transformation to the test set.
for (i in seq_len(ncol(PMA_PreModelling_Test) - 1)) {
  PMA_PreModelling_Test[, i] <- log(PMA_PreModelling_Test[, i] + 1)
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Working copies of the transformed train and test sets for the
# logistic regression analysis.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views of both sets. vapply() guarantees a logical column
# mask regardless of the input, unlike sapply().
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[, vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1))]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[, vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1))]
##################################
# Treating data skewness
# for the train set
##################################
# The natural-log-transformed train set is the working copy as-is; no
# further transformation is applied in this step.
PMA_PreModelling_Train_LR_LNT <- PMA_PreModelling_Train_LR
# The response column is already carried over by the copy above; this
# explicit assignment just re-states it.
PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of the numeric predictors in the LR_LNT train set,
# grouped by the levels of Log_Solubility_Class.
featurePlot(
  x = PMA_PreModelling_Train_LR_LNT[, vapply(PMA_PreModelling_Train_LR_LNT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_LNT Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column
# per response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_LNT[, vapply(PMA_PreModelling_Train_LR_LNT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_LNT Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG seed so the fold assignments are reproducible.
set.seed(12345678)

# Ten folds built from the response; returnTrain = TRUE makes each
# element hold the training-row indices of that fold.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control that reuses the fixed folds, with class
# probabilities enabled and twoClassSummary as the summary function.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG seed before fitting.
set.seed(12345678)

# Fit the logistic regression (caret method "glm") on every column except
# the response, evaluated with the predefined folds and selected on ROC.
LR_LNT_Tune <- train(x = PMA_PreModelling_Train_LR_LNT[, !names(PMA_PreModelling_Train_LR_LNT) %in% c("Log_Solubility_Class")],
                     y = PMA_PreModelling_Train_LR_LNT$Log_Solubility_Class,
                     method = "glm",
                     metric = "ROC",
                     trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the resampling summary of the fitted model.
LR_LNT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.891921 0.7492802 0.8245646
# Print the final fitted model (coefficients and deviance).
LR_LNT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 11.4167 1.2717 1.4241 -6.1614
## NumCarbon
## 0.4056
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 767.4 AIC: 777.4
# Print the table of resampled performance metrics.
LR_LNT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.891921 0.7492802 0.8245646 0.03393011 0.06711081 0.06719134
# Store the cross-validated ROC value; the enclosing parentheses also
# print the assigned value.
(LR_LNT_Train_ROCCurveAUC <- LR_LNT_Tune$results$ROC)
## [1] 0.891921
##################################
# Identifying and plotting the
# best model predictors
##################################
# Compute variable importance for the fitted model, requested with
# scale = TRUE.
LR_LNT_VarImp <- varImp(LR_LNT_Tune, scale = TRUE)

# Plot the top 4 predictors by importance.
plot(LR_LNT_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# The natural-log-transformed test set is the working copy as-is; no
# further transformation is applied in this step.
PMA_PreModelling_Test_LR_LNT <- PMA_PreModelling_Test_LR
# The response column is already carried over by the copy above; this
# explicit assignment just re-states it.
PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of the numeric predictors in the LR_LNT test set,
# grouped by the levels of Log_Solubility_Class.
featurePlot(
  x = PMA_PreModelling_Test_LR_LNT[, vapply(PMA_PreModelling_Test_LR_LNT, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_LNT Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column
# per response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_LNT[, vapply(PMA_PreModelling_Test_LR_LNT, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_LNT Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair each observed test-set class with the model's predicted class
# probabilities. The predictors passed to predict() are all test-set
# columns except the response; type = "prob" yields one probability
# column per class, which data.frame() names LR_LNT_Predicted.Low and
# LR_LNT_Predicted.High.
LR_LNT_Test <- data.frame(LR_LNT_Observed = PMA_PreModelling_Test_LR_LNT$Log_Solubility_Class,
                          LR_LNT_Predicted = predict(LR_LNT_Tune,
                                                     PMA_PreModelling_Test_LR_LNT[, !names(PMA_PreModelling_Test_LR_LNT) %in% c("Log_Solubility_Class")],
                                                     type = "prob"))

# Print the observed-versus-predicted table.
LR_LNT_Test
## LR_LNT_Observed LR_LNT_Predicted.Low LR_LNT_Predicted.High
## 20 High 0.0114244517 0.9885755483
## 21 High 0.0080310909 0.9919689091
## 23 High 0.0354283423 0.9645716577
## 25 High 0.0113319188 0.9886680812
## 28 High 0.0603784377 0.9396215623
## 31 High 0.0126421937 0.9873578063
## 32 High 0.0204088121 0.9795911879
## 33 High 0.0401610347 0.9598389653
## 34 High 0.0401610347 0.9598389653
## 37 High 0.3543519894 0.6456480106
## 38 High 0.3543519894 0.6456480106
## 42 High 0.4516439293 0.5483560707
## 49 High 0.1326709535 0.8673290465
## 54 High 0.0127138418 0.9872861582
## 55 High 0.0171425230 0.9828574770
## 58 High 0.4508886389 0.5491113611
## 60 High 0.0458725899 0.9541274101
## 61 High 0.0401610347 0.9598389653
## 65 High 0.0422682755 0.9577317245
## 69 High 0.5360931241 0.4639068759
## 73 High 0.0313344498 0.9686655502
## 86 High 0.0379168490 0.9620831510
## 90 High 0.0246356037 0.9753643963
## 91 High 0.0112519351 0.9887480649
## 93 High 0.0246356037 0.9753643963
## 96 High 0.0112519351 0.9887480649
## 98 High 0.0281163829 0.9718836171
## 100 High 0.0455991395 0.9544008605
## 104 High 0.8596843413 0.1403156587
## 112 High 0.1899203318 0.8100796682
## 115 High 0.8830092589 0.1169907411
## 119 High 0.1145296081 0.8854703919
## 128 High 0.1145296081 0.8854703919
## 130 High 0.0186594977 0.9813405023
## 139 High 0.0186594977 0.9813405023
## 143 High 0.0369097104 0.9630902896
## 145 High 0.0632876797 0.9367123203
## 146 High 0.1145296081 0.8854703919
## 149 High 0.1330597875 0.8669402125
## 150 High 0.0881581152 0.9118418848
## 152 High 0.0401610347 0.9598389653
## 157 High 0.5491594648 0.4508405352
## 161 High 0.1818965999 0.8181034001
## 162 High 0.0114108307 0.9885891693
## 166 High 0.5306213331 0.4693786669
## 167 High 0.1130629002 0.8869370998
## 173 High 0.0762280843 0.9237719157
## 176 High 0.1145296081 0.8854703919
## 182 High 0.0040896523 0.9959103477
## 187 High 0.0411811702 0.9588188298
## 190 High 0.0232842897 0.9767157103
## 194 High 0.0175444271 0.9824555729
## 195 High 0.1108113031 0.8891886969
## 201 High 0.0436230271 0.9563769729
## 207 High 0.0716468222 0.9283531778
## 208 High 0.5395740091 0.4604259909
## 215 High 0.0126421937 0.9873578063
## 222 High 0.1592460118 0.8407539882
## 224 High 0.2064348880 0.7935651120
## 231 High 0.7632446817 0.2367553183
## 236 High 0.0866704378 0.9133295622
## 237 High 0.0136419825 0.9863580175
## 240 High 0.2146524994 0.7853475006
## 243 High 0.0436230271 0.9563769729
## 248 High 0.1108113031 0.8891886969
## 251 High 0.6961143000 0.3038857000
## 256 High 0.3070340842 0.6929659158
## 258 High 0.1230410001 0.8769589999
## 262 High 0.5395740091 0.4604259909
## 266 High 0.4571965619 0.5428034381
## 272 High 0.5720886850 0.4279113150
## 280 High 0.3082654815 0.6917345185
## 283 High 0.4354389857 0.5645610143
## 286 High 0.4391327329 0.5608672671
## 287 High 0.1041477820 0.8958522180
## 289 High 0.1437887272 0.8562112728
## 290 High 0.3282018426 0.6717981574
## 298 High 0.2939848627 0.7060151373
## 305 High 0.2492525888 0.7507474112
## 306 High 0.2398758877 0.7601241123
## 312 High 0.0973381705 0.9026618295
## 320 High 0.3058707588 0.6941292412
## 325 High 0.1771522624 0.8228477376
## 332 High 0.0556061812 0.9443938188
## 333 High 0.4847787185 0.5152212815
## 335 High 0.2939848627 0.7060151373
## 339 High 0.8371427968 0.1628572032
## 346 High 0.2382954634 0.7617045366
## 347 High 0.0327311005 0.9672688995
## 350 High 0.4097345492 0.5902654508
## 353 High 0.2663701600 0.7336298400
## 358 High 0.2182251914 0.7817748086
## 365 High 0.1922987093 0.8077012907
## 367 High 0.2058056954 0.7941943046
## 370 High 0.0651543422 0.9348456578
## 379 High 0.1327838973 0.8672161027
## 386 High 0.5247916818 0.4752083182
## 394 High 0.4697661230 0.5302338770
## 396 High 0.1215635914 0.8784364086
## 400 High 0.0136419825 0.9863580175
## 404 High 0.0493865148 0.9506134852
## 405 High 0.6323780570 0.3676219430
## 413 High 0.0931860048 0.9068139952
## 415 High 0.3133445051 0.6866554949
## 417 High 0.2643581720 0.7356418280
## 418 High 0.3679591512 0.6320408488
## 423 High 0.2714888903 0.7285111097
## 434 High 0.1877329621 0.8122670379
## 437 High 0.3396836347 0.6603163653
## 440 High 0.4302378412 0.5697621588
## 449 High 0.3642274184 0.6357725816
## 450 High 0.2638891235 0.7361108765
## 457 High 0.3642274184 0.6357725816
## 467 High 0.3843344069 0.6156655931
## 469 High 0.3427771468 0.6572228532
## 474 High 0.9448977678 0.0551022322
## 475 High 0.9069691143 0.0930308857
## 485 High 0.1206262001 0.8793737999
## 504 Low 0.2810994647 0.7189005353
## 511 Low 0.7713879404 0.2286120596
## 512 Low 0.3771323094 0.6228676906
## 517 Low 0.0796952208 0.9203047792
## 519 Low 0.6631915667 0.3368084333
## 520 Low 0.0365399927 0.9634600073
## 522 Low 0.9567591567 0.0432408433
## 527 Low 0.5472028728 0.4527971272
## 528 Low 0.4455816601 0.5544183399
## 529 Low 0.2855557521 0.7144442479
## 537 Low 0.0818301918 0.9181698082
## 540 Low 0.9324385799 0.0675614201
## 541 Low 0.7941384873 0.2058615127
## 547 Low 0.9225916586 0.0774083414
## 550 Low 0.7281742163 0.2718257837
## 555 Low 0.4391059978 0.5608940022
## 564 Low 0.0179846866 0.9820153134
## 570 Low 0.3080507810 0.6919492190
## 573 Low 0.2060658806 0.7939341194
## 575 Low 0.7941384873 0.2058615127
## 578 Low 0.2347216890 0.7652783110
## 581 Low 0.2060658806 0.7939341194
## 585 Low 0.3173015329 0.6826984671
## 590 Low 0.6875129427 0.3124870573
## 601 Low 0.7826540228 0.2173459772
## 602 Low 0.6143605885 0.3856394115
## 607 Low 0.4573521726 0.5426478274
## 610 Low 0.4429636059 0.5570363941
## 618 Low 0.7158484958 0.2841515042
## 624 Low 0.3173015329 0.6826984671
## 626 Low 0.1600193605 0.8399806395
## 627 Low 0.3215503018 0.6784496982
## 634 Low 0.6883858742 0.3116141258
## 640 Low 0.9844093361 0.0155906639
## 642 Low 0.2100918801 0.7899081199
## 643 Low 0.8692472243 0.1307527757
## 644 Low 0.8718338373 0.1281661627
## 645 Low 0.7731505226 0.2268494774
## 646 Low 0.9397436656 0.0602563344
## 647 Low 0.7401939612 0.2598060388
## 652 Low 0.1584162203 0.8415837797
## 658 Low 0.5345180440 0.4654819560
## 659 Low 0.9685770972 0.0314229028
## 660 Low 0.8148166346 0.1851833654
## 664 Low 0.3515609710 0.6484390290
## 666 Low 0.4641158585 0.5358841415
## 667 Low 0.8356306242 0.1643693758
## 675 Low 0.8692472243 0.1307527757
## 680 Low 0.9606395556 0.0393604444
## 681 Low 0.8406617830 0.1593382170
## 687 Low 0.9754003752 0.0245996248
## 694 Low 0.6710783028 0.3289216972
## 697 Low 0.5964862294 0.4035137706
## 701 Low 0.5884750688 0.4115249312
## 705 Low 0.8954203536 0.1045796464
## 707 Low 0.5851454633 0.4148545367
## 710 Low 0.8010389126 0.1989610874
## 716 Low 0.8184036632 0.1815963368
## 719 Low 0.9428266773 0.0571733227
## 720 Low 0.9847960378 0.0152039622
## 725 Low 0.9818803020 0.0181196980
## 727 Low 0.5884750688 0.4115249312
## 730 Low 0.5611518183 0.4388481817
## 738 Low 0.9081879879 0.0918120121
## 745 Low 0.9170977223 0.0829022777
## 748 Low 0.8391627813 0.1608372187
## 751 Low 0.8963099718 0.1036900282
## 756 Low 0.6480019501 0.3519980499
## 766 Low 0.6647629476 0.3352370524
## 769 Low 0.6639046057 0.3360953943
## 783 Low 0.8863780609 0.1136219391
## 785 Low 0.9847467994 0.0152532006
## 790 Low 0.8248336297 0.1751663703
## 793 Low 0.8863780609 0.1136219391
## 795 Low 0.9897095855 0.0102904145
## 796 Low 0.9850516420 0.0149483580
## 797 Low 0.4811591379 0.5188408621
## 801 Low 0.7986667815 0.2013332185
## 811 Low 0.5989448209 0.4010551791
## 812 Low 0.8865111374 0.1134888626
## 815 Low 0.9820707051 0.0179292949
## 816 Low 0.8050772583 0.1949227417
## 817 Low 0.9939985174 0.0060014826
## 824 Low 0.8901108481 0.1098891519
## 825 Low 0.8901108481 0.1098891519
## 826 Low 0.8901108481 0.1098891519
## 830 Low 0.9863085584 0.0136914416
## 837 Low 0.9927031149 0.0072968851
## 838 Low 0.8050772583 0.1949227417
## 844 Low 0.9899820408 0.0100179592
## 845 Low 0.9980207841 0.0019792159
## 847 Low 0.9347533178 0.0652466822
## 850 Low 0.9018688685 0.0981311315
## 852 Low 0.9155921637 0.0844078363
## 853 Low 0.9155921637 0.0844078363
## 861 Low 0.9288460568 0.0711539432
## 868 Low 0.9983178675 0.0016821325
## 874 Low 0.9409936465 0.0590063535
## 879 High 0.0743114149 0.9256885851
## 895 High 0.0058470295 0.9941529705
## 899 High 0.0007541913 0.9992458087
## 903 High 0.0058470295 0.9941529705
## 917 High 0.0541474152 0.9458525848
## 927 High 0.0127138418 0.9872861582
## 929 High 0.0762280843 0.9237719157
## 931 High 0.0127138418 0.9872861582
## 933 High 0.3431573442 0.6568426558
## 944 High 0.0283606163 0.9716393837
## 947 High 0.0246356037 0.9753643963
## 949 High 0.1676976167 0.8323023833
## 953 High 0.0325221723 0.9674778277
## 958 High 0.5930878020 0.4069121980
## 961 High 0.0095523084 0.9904476916
## 963 High 0.0969115592 0.9030884408
## 964 High 0.0436230271 0.9563769729
## 973 High 0.0478986779 0.9521013221
## 976 High 0.0179846866 0.9820153134
## 977 High 0.1108113031 0.8891886969
## 980 High 0.3323177208 0.6676822792
## 983 High 0.6441963861 0.3558036139
## 984 High 0.1108113031 0.8891886969
## 986 High 0.0973381705 0.9026618295
## 989 High 0.1679011559 0.8320988441
## 991 High 0.0218893139 0.9781106861
## 996 High 0.0176152778 0.9823847222
## 997 High 0.4484937124 0.5515062876
## 999 High 0.0354283423 0.9645716577
## 1000 High 0.0598930376 0.9401069624
## 1003 High 0.0136419825 0.9863580175
## 1008 High 0.0805914802 0.9194085198
## 1009 High 0.4354389857 0.5645610143
## 1014 High 0.0282625766 0.9717374234
## 1015 High 0.4604541718 0.5395458282
## 1040 High 0.1834826180 0.8165173820
## 1042 High 0.3502196618 0.6497803382
## 1043 High 0.7301353926 0.2698646074
## 1050 High 0.1128584616 0.8871415384
## 1052 High 0.1326642328 0.8673357672
## 1056 High 0.4214253198 0.5785746802
## 1070 High 0.5016341979 0.4983658021
## 1073 High 0.5016785964 0.4983214036
## 1074 High 0.1240863502 0.8759136498
## 1079 High 0.4087109291 0.5912890709
## 1080 High 0.6775015397 0.3224984603
## 1085 High 0.1017757206 0.8982242794
## 1087 High 0.5255391386 0.4744608614
## 1096 High 0.8643165720 0.1356834280
## 1099 High 0.4099125986 0.5900874014
## 1100 High 0.7154999827 0.2845000173
## 1102 High 0.0887503601 0.9112496399
## 1107 Low 0.6788519384 0.3211480616
## 1109 Low 0.6292550701 0.3707449299
## 1114 Low 0.7123656715 0.2876343285
## 1118 Low 0.2889459081 0.7110540919
## 1123 Low 0.4860246521 0.5139753479
## 1132 Low 0.9313431964 0.0686568036
## 1134 Low 0.5204468329 0.4795531671
## 1137 Low 0.3215503018 0.6784496982
## 1154 Low 0.3215503018 0.6784496982
## 1155 Low 0.4988737937 0.5011262063
## 1157 Low 0.6870639639 0.3129360361
## 1162 Low 0.8692472243 0.1307527757
## 1164 Low 0.2100918801 0.7899081199
## 1171 Low 0.8963099718 0.1036900282
## 1172 Low 0.5491594648 0.4508405352
## 1175 Low 0.5561457281 0.4438542719
## 1177 Low 0.7164966875 0.2835033125
## 1179 Low 0.9138172674 0.0861827326
## 1183 Low 0.4123464196 0.5876535804
## 1185 Low 0.9073388166 0.0926611834
## 1189 Low 0.7755333295 0.2244666705
## 1211 Low 0.9600742008 0.0399257992
## 1218 Low 0.9898292244 0.0101707756
## 1224 Low 0.4045598255 0.5954401745
## 1225 Low 0.5884750688 0.4115249312
## 1227 Low 0.8791559614 0.1208440386
## 1232 Low 0.9872441389 0.0127558611
## 1235 Low 0.6380743568 0.3619256432
## 1238 Low 0.6179361327 0.3820638673
## 1240 Low 0.8199182266 0.1800817734
## 1241 Low 0.9600742008 0.0399257992
## 1248 Low 0.9863085584 0.0136914416
## 1258 Low 0.8050772583 0.1949227417
## 1261 Low 0.9027349430 0.0972650570
## 1263 Low 0.8901108481 0.1098891519
## 1269 Low 0.9568830828 0.0431169172
## 1270 Low 0.9882558677 0.0117441323
## 1271 Low 0.9018688685 0.0981311315
## 1272 Low 0.9155921637 0.0844078363
## 1280 Low 0.9409936465 0.0590063535
## 1286 Low 0.9990096848 0.0009903152
## 1287 Low 0.9990849676 0.0009150324
## 1289 Low 0.9599001175 0.0400998825
## 1290 Low 0.9155921637 0.0844078363
## 1291 High 0.1594073766 0.8405926234
## 1294 High 0.8044109182 0.1955890818
## 1305 Low 0.8477696040 0.1522303960
## 1308 High 0.6914165831 0.3085834169
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the ROC curve for the LNT model on the test set: the observed
# class is the response and the predicted probability of the "High" class
# is the score. The observed factor's levels are passed in reverse order.
LR_LNT_Test_ROC <- roc(response = LR_LNT_Test$LR_LNT_Observed,
                       predictor = LR_LNT_Test$LR_LNT_Predicted.High,
                       levels = rev(levels(LR_LNT_Test$LR_LNT_Observed)))

# Store the test-set AUC as a plain number; the enclosing parentheses
# also print the assigned value.
(LR_LNT_Test_ROCCurveAUC <- auc(LR_LNT_Test_ROC)[1])
## [1] 0.8988237
##################################
# Applying a numerical adjustment
# (adding 1) followed by the
# square root function
##################################
# Start from fresh copies of the filtered train and test sets.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

# Transform every column except the last one (the Log_Solubility_Class
# response) as sqrt(x + 1). seq_len() is used instead of 1:(n - 1) so the
# loop body is skipped rather than run backwards if only one column exists.
for (i in seq_len(ncol(PMA_PreModelling_Train) - 1)) {
  PMA_PreModelling_Train[, i] <- sqrt(PMA_PreModelling_Train[, i] + 1)
}

# Apply the identical transformation to the test set.
for (i in seq_len(ncol(PMA_PreModelling_Test) - 1)) {
  PMA_PreModelling_Test[, i] <- sqrt(PMA_PreModelling_Test[, i] + 1)
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Working copies of the transformed train and test sets for the
# logistic regression analysis.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views of both sets. vapply() guarantees a logical column
# mask regardless of the input, unlike sapply().
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[, vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1))]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[, vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1))]
##################################
# Treating data skewness
# for the train set
##################################
# The square-root-transformed train set is the working copy as-is; no
# further transformation is applied in this step.
PMA_PreModelling_Train_LR_SRT <- PMA_PreModelling_Train_LR
# The response column is already carried over by the copy above; this
# explicit assignment just re-states it.
PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the train set
##################################
# No actions applied
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of the numeric predictors in the LR_SRT train set,
# grouped by the levels of Log_Solubility_Class.
featurePlot(
  x = PMA_PreModelling_Train_LR_SRT[, vapply(PMA_PreModelling_Train_LR_SRT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SRT Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors; the legend gets one column
# per response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_SRT[, vapply(PMA_PreModelling_Train_LR_SRT, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SRT Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG seed so the fold assignments are reproducible.
set.seed(12345678)

# Ten folds built from the response; returnTrain = TRUE makes each
# element hold the training-row indices of that fold.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control that reuses the fixed folds, with class
# probabilities enabled and twoClassSummary as the summary function.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG seed before fitting.
set.seed(12345678)

# Fit the logistic regression (caret method "glm") on every column except
# the response, evaluated with the predefined folds and selected on ROC.
LR_SRT_Tune <- train(x = PMA_PreModelling_Train_LR_SRT[, !names(PMA_PreModelling_Train_LR_SRT) %in% c("Log_Solubility_Class")],
                     y = PMA_PreModelling_Train_LR_SRT$Log_Solubility_Class,
                     method = "glm",
                     metric = "ROC",
                     trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the caret summary of the resampled fit.
LR_SRT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8844149 0.7163344 0.830225
# Print the underlying glm object (coefficients and deviance).
LR_SRT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 6.2882 2.7830 0.5132 -2.7168
## NumCarbon
## -0.3034
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 800.3 AIC: 810.3
# Print the resampling metrics table (means and standard deviations).
LR_SRT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8844149 0.7163344 0.830225 0.03385158 0.07539679 0.06335689
# Store the cross-validated ROC AUC; the outer parentheses also print it.
(LR_SRT_Train_ROCCurveAUC <- LR_SRT_Tune$results$ROC)
## [1] 0.8844149
##################################
# Identifying and plotting the
# best model predictors
##################################
# Compute scaled variable importance for the fitted logistic regression.
LR_SRT_VarImp <- varImp(LR_SRT_Tune, scale = TRUE)

# Plot the ranked importance of the top 4 predictors.
plot(LR_SRT_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# Create the LR_SRT test set as a copy of the LR test set.
# NOTE(review): no transformation is performed in this step itself; confirm
# that the skewness treatment used for the train set was applied upstream.
PMA_PreModelling_Test_LR_SRT <- PMA_PreModelling_Test_LR

# Re-attach the response column (already present after the copy above, so
# this leaves the data unchanged).
PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Treating data outliers
# for the test set
##################################
# No actions applied
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the LR_SRT test set,
# split by the level of Log_Solubility_Class.
# Each panel gets its own free x and y scale.
featurePlot(
  x = PMA_PreModelling_Test_LR_SRT[, vapply(PMA_PreModelling_Test_LR_SRT,
                                            is.numeric,
                                            logical(1))],
  y = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SRT Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same numeric predictors by response level.
# The legend is laid out with one column per response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_SRT[, vapply(PMA_PreModelling_Test_LR_SRT,
                                            is.numeric,
                                            logical(1))],
  y = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SRT Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class)
  )
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair the observed test-set class with the model's predicted class
# probabilities. type = "prob" returns one probability column per class, so
# the result has LR_SRT_Predicted.Low and LR_SRT_Predicted.High columns.
# The response column is excluded from the predictors passed to predict().
LR_SRT_Test <- data.frame(LR_SRT_Observed = PMA_PreModelling_Test_LR_SRT$Log_Solubility_Class,
                          LR_SRT_Predicted = predict(LR_SRT_Tune,
                                                     PMA_PreModelling_Test_LR_SRT[, !names(PMA_PreModelling_Test_LR_SRT) %in% c("Log_Solubility_Class")],
                                                     type = "prob"))

# Print the observed-versus-predicted table.
LR_SRT_Test
## LR_SRT_Observed LR_SRT_Predicted.Low LR_SRT_Predicted.High
## 20 High 0.0171776680 0.982822332
## 21 High 0.0094733300 0.990526670
## 23 High 0.0429512262 0.957048774
## 25 High 0.0166962209 0.983303779
## 28 High 0.0615130927 0.938486907
## 31 High 0.0346712284 0.965328772
## 32 High 0.0572263673 0.942773633
## 33 High 0.0870345720 0.912965428
## 34 High 0.0870345720 0.912965428
## 37 High 0.3518170970 0.648182903
## 38 High 0.3518170970 0.648182903
## 42 High 0.5218061033 0.478193897
## 49 High 0.1799027090 0.820097291
## 54 High 0.0328468786 0.967153121
## 55 High 0.0151348891 0.984865111
## 58 High 0.4317935721 0.568206428
## 60 High 0.0847834160 0.915216584
## 61 High 0.0870345720 0.912965428
## 65 High 0.0573837578 0.942616242
## 69 High 0.5187323178 0.481267682
## 73 High 0.0088225257 0.991177474
## 86 High 0.0731684575 0.926831543
## 90 High 0.0538625246 0.946137475
## 91 High 0.0064373829 0.993562617
## 93 High 0.0538625246 0.946137475
## 96 High 0.0064373829 0.993562617
## 98 High 0.0598648467 0.940135153
## 100 High 0.0719361787 0.928063821
## 104 High 0.9099158472 0.090084153
## 112 High 0.2285023414 0.771497659
## 115 High 0.8750053683 0.124994632
## 119 High 0.1622573557 0.837742644
## 128 High 0.1622573557 0.837742644
## 130 High 0.0161060102 0.983893990
## 139 High 0.0161060102 0.983893990
## 143 High 0.0417491435 0.958250856
## 145 High 0.1137434063 0.886256594
## 146 High 0.1622573557 0.837742644
## 149 High 0.1593165890 0.840683411
## 150 High 0.1398798250 0.860120175
## 152 High 0.0870345720 0.912965428
## 157 High 0.5055646637 0.494435336
## 161 High 0.2398674752 0.760132525
## 162 High 0.0066302108 0.993369789
## 166 High 0.5237831608 0.476216839
## 167 High 0.1651313064 0.834868694
## 173 High 0.1259453369 0.874054663
## 176 High 0.1622573557 0.837742644
## 182 High 0.0222998583 0.977700142
## 187 High 0.0370880559 0.962911944
## 190 High 0.0134601255 0.986539874
## 194 High 0.0291152071 0.970884793
## 195 High 0.1750006699 0.824999330
## 201 High 0.0832309264 0.916769074
## 207 High 0.1222726588 0.877727341
## 208 High 0.4877656097 0.512234390
## 215 High 0.0346712284 0.965328772
## 222 High 0.2353885107 0.764611489
## 224 High 0.1922505225 0.807749477
## 231 High 0.7189699421 0.281030058
## 236 High 0.1056715454 0.894328455
## 237 High 0.0456519036 0.954348096
## 240 High 0.2128438968 0.787156103
## 243 High 0.0832309264 0.916769074
## 248 High 0.1750006699 0.824999330
## 251 High 0.7704009245 0.229599076
## 256 High 0.3841407732 0.615859227
## 258 High 0.1701526065 0.829847393
## 262 High 0.4877656097 0.512234390
## 266 High 0.4606389841 0.539361016
## 272 High 0.5616128480 0.438387152
## 280 High 0.3111471891 0.688852811
## 283 High 0.4319894761 0.568010524
## 286 High 0.4462380183 0.553761982
## 287 High 0.1669680937 0.833031906
## 289 High 0.1191373826 0.880862617
## 290 High 0.3312408721 0.668759128
## 298 High 0.3172571574 0.682742843
## 305 High 0.2944486733 0.705551327
## 306 High 0.1943816153 0.805618385
## 312 High 0.0989000206 0.901099979
## 320 High 0.3131330745 0.686866926
## 325 High 0.1824338561 0.817566144
## 332 High 0.0623662697 0.937633730
## 333 High 0.4443815794 0.555618421
## 335 High 0.3172571574 0.682742843
## 339 High 0.7526842928 0.247315707
## 346 High 0.3267166104 0.673283390
## 347 High 0.0533733809 0.946626619
## 350 High 0.3802631772 0.619736823
## 353 High 0.3335196993 0.666480301
## 358 High 0.3045483863 0.695451614
## 365 High 0.2268214855 0.773178514
## 367 High 0.1951627467 0.804837253
## 370 High 0.0351733764 0.964826624
## 379 High 0.1045430079 0.895456992
## 386 High 0.3769454782 0.623054522
## 394 High 0.5261865734 0.473813427
## 396 High 0.1296532429 0.870346757
## 400 High 0.0456519036 0.954348096
## 404 High 0.0482692739 0.951730726
## 405 High 0.5987312318 0.401268768
## 413 High 0.1269081773 0.873091823
## 415 High 0.3684495301 0.631550470
## 417 High 0.2322587979 0.767741202
## 418 High 0.4327304582 0.567269542
## 423 High 0.2911428364 0.708857164
## 434 High 0.2665859841 0.733414016
## 437 High 0.2526126299 0.747387370
## 440 High 0.3719172425 0.628082757
## 449 High 0.3790499525 0.620950048
## 450 High 0.2818076021 0.718192398
## 457 High 0.3790499525 0.620950048
## 467 High 0.3100685455 0.689931454
## 469 High 0.1599358543 0.840064146
## 474 High 0.9353877385 0.064612262
## 475 High 0.9012220999 0.098777900
## 485 High 0.1079410717 0.892058928
## 504 Low 0.1987091431 0.801290857
## 511 Low 0.7040002637 0.295999736
## 512 Low 0.4183441677 0.581655832
## 517 Low 0.0403857448 0.959614255
## 519 Low 0.7193762711 0.280623729
## 520 Low 0.0818263209 0.918173679
## 522 Low 0.9390148916 0.060985108
## 527 Low 0.6225909545 0.377409046
## 528 Low 0.3219492221 0.678050778
## 529 Low 0.3079150489 0.692084951
## 537 Low 0.1326610454 0.867338955
## 540 Low 0.9212090082 0.078790992
## 541 Low 0.5642976565 0.435702343
## 547 Low 0.9188139959 0.081186004
## 550 Low 0.5753601845 0.424639815
## 555 Low 0.5010132937 0.498986706
## 564 Low 0.0412981629 0.958701837
## 570 Low 0.3533886536 0.646611346
## 573 Low 0.2380917144 0.761908286
## 575 Low 0.5642976565 0.435702343
## 578 Low 0.2113833669 0.788616633
## 581 Low 0.2380917144 0.761908286
## 585 Low 0.3217351998 0.678264800
## 590 Low 0.7367369732 0.263263027
## 601 Low 0.8572092918 0.142790708
## 602 Low 0.6397712472 0.360228753
## 607 Low 0.5539816043 0.446018396
## 610 Low 0.5411389615 0.458861038
## 618 Low 0.7487472889 0.251252711
## 624 Low 0.3217351998 0.678264800
## 626 Low 0.2312349798 0.768765020
## 627 Low 0.3353599489 0.664640051
## 634 Low 0.6280533141 0.371946686
## 640 Low 0.9879769391 0.012023061
## 642 Low 0.1928813704 0.807118630
## 643 Low 0.6470732658 0.352926734
## 644 Low 0.8524420814 0.147557919
## 645 Low 0.7576589388 0.242341061
## 646 Low 0.7766825260 0.223317474
## 647 Low 0.7768079181 0.223192082
## 652 Low 0.1992101897 0.800789810
## 658 Low 0.5871468529 0.412853147
## 659 Low 0.8531651594 0.146834841
## 660 Low 0.8805707118 0.119429288
## 664 Low 0.3430474939 0.656952506
## 666 Low 0.4365034540 0.563496546
## 667 Low 0.8629695776 0.137030422
## 675 Low 0.6470732658 0.352926734
## 680 Low 0.9839734149 0.016026585
## 681 Low 0.8925205505 0.107479449
## 687 Low 0.8732490620 0.126750938
## 694 Low 0.7386214676 0.261378532
## 697 Low 0.5743225447 0.425677455
## 701 Low 0.3916949690 0.608305031
## 705 Low 0.9524371969 0.047562803
## 707 Low 0.6657996248 0.334200375
## 710 Low 0.7143156897 0.285684310
## 716 Low 0.8822175146 0.117782485
## 719 Low 0.9406540923 0.059345908
## 720 Low 0.9780703756 0.021929624
## 725 Low 0.9799007961 0.020099204
## 727 Low 0.3916949690 0.608305031
## 730 Low 0.5185168218 0.481483178
## 738 Low 0.8416972687 0.158302731
## 745 Low 0.7186886650 0.281311335
## 748 Low 0.7852185869 0.214781413
## 751 Low 0.9427266890 0.057273311
## 756 Low 0.7000527225 0.299947278
## 766 Low 0.7710747011 0.228925299
## 769 Low 0.5964903375 0.403509663
## 783 Low 0.8502174908 0.149782509
## 785 Low 0.9090787984 0.090921202
## 790 Low 0.8886512925 0.111348707
## 793 Low 0.8502174908 0.149782509
## 795 Low 0.9915592821 0.008440718
## 796 Low 0.9837678710 0.016232129
## 797 Low 0.5608208122 0.439179188
## 801 Low 0.7642200416 0.235779958
## 811 Low 0.5397913514 0.460208649
## 812 Low 0.9363960470 0.063603953
## 815 Low 0.9586825149 0.041317485
## 816 Low 0.7804673310 0.219532669
## 817 Low 0.9566695569 0.043330443
## 824 Low 0.8709603992 0.129039601
## 825 Low 0.8709603992 0.129039601
## 826 Low 0.8709603992 0.129039601
## 830 Low 0.9170635243 0.082936476
## 837 Low 0.9480386701 0.051961330
## 838 Low 0.7804673310 0.219532669
## 844 Low 0.9343242730 0.065675727
## 845 Low 0.9833504460 0.016649554
## 847 Low 0.9315521332 0.068447867
## 850 Low 0.8919647214 0.108035279
## 852 Low 0.9110529484 0.088947052
## 853 Low 0.9110529484 0.088947052
## 861 Low 0.9275671791 0.072432821
## 868 Low 0.9859660498 0.014033950
## 874 Low 0.9416548705 0.058345129
## 879 High 0.1190842356 0.880915764
## 895 High 0.0185473665 0.981452634
## 899 High 0.0002913624 0.999708638
## 903 High 0.0185473665 0.981452634
## 917 High 0.0406554150 0.959344585
## 927 High 0.0328468786 0.967153121
## 929 High 0.1259453369 0.874054663
## 931 High 0.0328468786 0.967153121
## 933 High 0.3687867543 0.631213246
## 944 High 0.0424837446 0.957516255
## 947 High 0.0538625246 0.946137475
## 949 High 0.1819006993 0.818099301
## 953 High 0.0238344707 0.976165529
## 958 High 0.5223971717 0.477602828
## 961 High 0.0171642639 0.982835736
## 963 High 0.1328530189 0.867146981
## 964 High 0.0832309264 0.916769074
## 973 High 0.0517005568 0.948299443
## 976 High 0.0412981629 0.958701837
## 977 High 0.1750006699 0.824999330
## 980 High 0.2633610863 0.736638914
## 983 High 0.6133383461 0.386661654
## 984 High 0.1750006699 0.824999330
## 986 High 0.0989000206 0.901099979
## 989 High 0.2084058286 0.791594171
## 991 High 0.0254199950 0.974580005
## 996 High 0.0528955326 0.947104467
## 997 High 0.4403169291 0.559683071
## 999 High 0.0429512262 0.957048774
## 1000 High 0.0525655462 0.947434454
## 1003 High 0.0456519036 0.954348096
## 1008 High 0.0967426835 0.903257316
## 1009 High 0.4319894761 0.568010524
## 1014 High 0.0429687572 0.957031243
## 1015 High 0.4919848875 0.508015113
## 1040 High 0.1777604857 0.822239514
## 1042 High 0.3526828399 0.647317160
## 1043 High 0.7096112136 0.290388786
## 1050 High 0.1085884359 0.891411564
## 1052 High 0.1792599161 0.820740084
## 1056 High 0.1318850763 0.868114924
## 1070 High 0.5591896277 0.440810372
## 1073 High 0.4657303100 0.534269690
## 1074 High 0.1749873003 0.825012700
## 1079 High 0.4074777876 0.592522212
## 1080 High 0.5848037333 0.415196267
## 1085 High 0.0954182731 0.904581727
## 1087 High 0.6230391933 0.376960807
## 1096 High 0.9185149388 0.081485061
## 1099 High 0.4496281245 0.550371875
## 1100 High 0.6737379747 0.326262025
## 1102 High 0.0589896088 0.941010391
## 1107 Low 0.4722318025 0.527768198
## 1109 Low 0.6900698356 0.309930164
## 1114 Low 0.5024548018 0.497545198
## 1118 Low 0.3450298942 0.654970106
## 1123 Low 0.4711940701 0.528805930
## 1132 Low 0.8749560859 0.125043914
## 1134 Low 0.5948511032 0.405148897
## 1137 Low 0.3353599489 0.664640051
## 1154 Low 0.3353599489 0.664640051
## 1155 Low 0.5731491992 0.426850801
## 1157 Low 0.7851027520 0.214897248
## 1162 Low 0.6470732658 0.352926734
## 1164 Low 0.1928813704 0.807118630
## 1171 Low 0.9427266890 0.057273311
## 1172 Low 0.5055646637 0.494435336
## 1175 Low 0.6405974664 0.359402534
## 1177 Low 0.6626098078 0.337390192
## 1179 Low 0.9406173547 0.059382645
## 1183 Low 0.2979985007 0.702001499
## 1185 Low 0.9489202020 0.051079798
## 1189 Low 0.8642810191 0.135718981
## 1211 Low 0.8242523053 0.175747695
## 1218 Low 0.9938658969 0.006134103
## 1224 Low 0.3956745407 0.604325459
## 1225 Low 0.3916949690 0.608305031
## 1227 Low 0.9249686768 0.075031323
## 1232 Low 0.9889737070 0.011026293
## 1235 Low 0.7256637224 0.274336278
## 1238 Low 0.7264484488 0.273551551
## 1240 Low 0.8721554250 0.127844575
## 1241 Low 0.8242523053 0.175747695
## 1248 Low 0.9170635243 0.082936476
## 1258 Low 0.7804673310 0.219532669
## 1261 Low 0.8974384750 0.102561525
## 1263 Low 0.8709603992 0.129039601
## 1269 Low 0.9605579303 0.039442070
## 1270 Low 0.9897858307 0.010214169
## 1271 Low 0.8919647214 0.108035279
## 1272 Low 0.9110529484 0.088947052
## 1280 Low 0.9416548705 0.058345129
## 1286 Low 0.9913833133 0.008616687
## 1287 Low 0.9920841459 0.007915854
## 1289 Low 0.9625961144 0.037403886
## 1290 Low 0.9110529484 0.088947052
## 1291 High 0.1741556081 0.825844392
## 1294 High 0.7811147653 0.218885235
## 1305 Low 0.9179685932 0.082031407
## 1308 High 0.7657936357 0.234206364
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the test-set ROC curve from the predicted probability of "High".
# The factor levels are passed in reversed order (the response factor is
# defined with levels "Low", "High", so roc() receives "High" then "Low").
LR_SRT_Test_ROC <- roc(response = LR_SRT_Test$LR_SRT_Observed,
                       predictor = LR_SRT_Test$LR_SRT_Predicted.High,
                       levels = rev(levels(LR_SRT_Test$LR_SRT_Observed)))

# Store the test-set AUC as a plain number; the outer parentheses also print it.
(LR_SRT_Test_ROCCurveAUC <- auc(LR_SRT_Test_ROC)[1])
## [1] 0.8946198
##################################
# Applying winsorization function
##################################
# Start from fresh copies of the filtered train and test sets.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

# Winsorize every column except the last one (the loop stops at ncol - 1;
# the response Log_Solubility_Class is selected last when the sets are built).
# All thresholds and replacement values are computed from the TRAIN column
# and then applied to both the train and the test column.
# seq_len() keeps the loop empty when there are no predictor columns.
for (i in seq_len(ncol(PMA_PreModelling_Train) - 1)) {
  # Replacement values: 90th and 10th percentiles of the train column.
  Predictor_Percentile90 <- quantile(PMA_PreModelling_Train[, i], 0.90)
  Predictor_Percentile10 <- quantile(PMA_PreModelling_Train[, i], 0.10)

  # Outlier fences: 1.5 * IQR beyond the quartiles of the train column.
  Predictor_Percentile75 <- quantile(PMA_PreModelling_Train[, i], 0.75)
  Predictor_Percentile25 <- quantile(PMA_PreModelling_Train[, i], 0.25)
  Predictor_IQR <- Predictor_Percentile75 - Predictor_Percentile25
  Predictor_Outlier_UCL <- Predictor_Percentile75 + (1.5 * Predictor_IQR)
  Predictor_Outlier_LCL <- Predictor_Percentile25 - (1.5 * Predictor_IQR)

  # Values above the upper fence become the 90th percentile, values below
  # the lower fence become the 10th percentile, all others are unchanged.
  PMA_PreModelling_Train[, i] <- ifelse(PMA_PreModelling_Train[, i] > Predictor_Outlier_UCL,
                                        Predictor_Percentile90,
                                        ifelse(PMA_PreModelling_Train[, i] < Predictor_Outlier_LCL,
                                               Predictor_Percentile10,
                                               PMA_PreModelling_Train[, i]))

  # Same rule for the test column, using the train-derived fences and values
  # (computed above, before the train column was overwritten).
  PMA_PreModelling_Test[, i] <- ifelse(PMA_PreModelling_Test[, i] > Predictor_Outlier_UCL,
                                       Predictor_Percentile90,
                                       ifelse(PMA_PreModelling_Test[, i] < Predictor_Outlier_LCL,
                                              Predictor_Percentile10,
                                              PMA_PreModelling_Test[, i]))
}
##################################
# Creating a local object
# for the train and test sets
##################################
# Local copies of the winsorized train and test sets for this model.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views of each set (non-numeric columns are dropped).
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[, sapply(PMA_PreModelling_Train_LR, is.numeric)]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[, sapply(PMA_PreModelling_Test_LR, is.numeric)]
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
# Create the LR_WT train set as a copy of the LR train set, whose source
# data was winsorized in the loop above.
PMA_PreModelling_Train_LR_WT <- PMA_PreModelling_Train_LR

# Re-attach the response column (already present after the copy above, so
# this leaves the data unchanged).
PMA_PreModelling_Train_LR_WT$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the LR_WT train set,
# split by the level of Log_Solubility_Class.
# Each panel gets its own free x and y scale.
featurePlot(
  x = PMA_PreModelling_Train_LR_WT[, vapply(PMA_PreModelling_Train_LR_WT,
                                            is.numeric,
                                            logical(1))],
  y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_WT Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same numeric predictors by response level.
# The legend is laid out with one column per response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_WT[, vapply(PMA_PreModelling_Train_LR_WT,
                                            is.numeric,
                                            logical(1))],
  y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_WT Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Train_LR_WT$Log_Solubility_Class)
  )
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(12345678)

# Build 10 folds stratified on the response; returnTrain = TRUE makes each
# list element hold the row indices used for TRAINING in that fold.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control that reuses the explicit fold indices above.
# twoClassSummary together with classProbs = TRUE yields the ROC / Sens / Spec
# resampling metrics reported for the fitted model.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
# Fix the RNG seed so the model fit is reproducible.
set.seed(12345678)

# Fit a logistic regression (caret method "glm") on every column except the
# response, resampled with the 10-fold control object and scored by ROC.
LR_WT_Tune <- train(x = PMA_PreModelling_Train_LR_WT[, !names(PMA_PreModelling_Train_LR_WT) %in% c("Log_Solubility_Class")],
                    y = PMA_PreModelling_Train_LR_WT$Log_Solubility_Class,
                    method = "glm",
                    metric = "ROC",
                    trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
# Print the caret summary of the resampled fit.
LR_WT_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8742874 0.7024363 0.8359216
# Print the underlying glm object (coefficients and deviance).
LR_WT_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 4.21806 1.35203 0.05381 -0.30758
## NumCarbon
## -0.11897
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 834.5 AIC: 844.5
# Print the resampling metrics table (means and standard deviations).
LR_WT_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8742874 0.7024363 0.8359216 0.03831246 0.06100711 0.06260559
# Store the cross-validated ROC AUC; the outer parentheses also print it.
(LR_WT_Train_ROCCurveAUC <- LR_WT_Tune$results$ROC)
## [1] 0.8742874
##################################
# Identifying and plotting the
# best model predictors
##################################
# Compute scaled variable importance for the fitted logistic regression.
LR_WT_VarImp <- varImp(LR_WT_Tune, scale = TRUE)

# Plot the ranked importance of the top 4 predictors.
plot(LR_WT_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
# Create the LR_WT test set as a copy of the LR test set, whose source data
# was winsorized with the train-derived thresholds in the loop above.
PMA_PreModelling_Test_LR_WT <- PMA_PreModelling_Test_LR

# Re-attach the response column (already present after the copy above, so
# this leaves the data unchanged).
PMA_PreModelling_Test_LR_WT$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of every numeric predictor in the LR_WT test set,
# split by the level of Log_Solubility_Class.
# Each panel gets its own free x and y scale.
featurePlot(
  x = PMA_PreModelling_Test_LR_WT[, vapply(PMA_PreModelling_Test_LR_WT,
                                           is.numeric,
                                           logical(1))],
  y = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_WT Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same numeric predictors by response level.
# The legend is laid out with one column per response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_WT[, vapply(PMA_PreModelling_Test_LR_WT,
                                           is.numeric,
                                           logical(1))],
  y = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_WT Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(
    columns = nlevels(PMA_PreModelling_Test_LR_WT$Log_Solubility_Class)
  )
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair the observed test-set class with the model's predicted class
# probabilities. type = "prob" returns one probability column per class, so
# the result has LR_WT_Predicted.Low and LR_WT_Predicted.High columns.
# The response column is excluded from the predictors passed to predict().
LR_WT_Test <- data.frame(LR_WT_Observed = PMA_PreModelling_Test_LR_WT$Log_Solubility_Class,
                         LR_WT_Predicted = predict(LR_WT_Tune,
                                                   PMA_PreModelling_Test_LR_WT[, !names(PMA_PreModelling_Test_LR_WT) %in% c("Log_Solubility_Class")],
                                                   type = "prob"))

# Print the observed-versus-predicted table.
LR_WT_Test
## LR_WT_Observed LR_WT_Predicted.Low LR_WT_Predicted.High
## 20 High 0.028252639 0.971747361
## 21 High 0.011032277 0.988967723
## 23 High 0.050174667 0.949825333
## 25 High 0.021395292 0.978604708
## 28 High 0.063987262 0.936012738
## 31 High 0.069295112 0.930704888
## 32 High 0.100841511 0.899158489
## 33 High 0.126560560 0.873439440
## 34 High 0.126560560 0.873439440
## 37 High 0.327406425 0.672593575
## 38 High 0.327406425 0.672593575
## 42 High 0.551750285 0.448249715
## 49 High 0.201947838 0.798052162
## 54 High 0.059988529 0.940011471
## 55 High 0.011554148 0.988445852
## 58 High 0.400584308 0.599415692
## 60 High 0.126557798 0.873442202
## 61 High 0.126560560 0.873439440
## 65 High 0.077399182 0.922600818
## 69 High 0.478379139 0.521620861
## 73 High 0.042117210 0.957882790
## 86 High 0.111173578 0.888826422
## 90 High 0.086098117 0.913901883
## 91 High 0.024835727 0.975164273
## 93 High 0.086098117 0.913901883
## 96 High 0.024835727 0.975164273
## 98 High 0.094952344 0.905047656
## 100 High 0.101736634 0.898263366
## 104 High 0.935621948 0.064378052
## 112 High 0.234756885 0.765243115
## 115 High 0.850921886 0.149078114
## 119 High 0.185157545 0.814842455
## 128 High 0.185157545 0.814842455
## 130 High 0.012185144 0.987814856
## 139 High 0.012185144 0.987814856
## 143 High 0.044803695 0.955196305
## 145 High 0.152326220 0.847673780
## 146 High 0.185157545 0.814842455
## 149 High 0.184218497 0.815781503
## 150 High 0.167743600 0.832256400
## 152 High 0.126560560 0.873439440
## 157 High 0.459326567 0.540673433
## 161 High 0.280613265 0.719386735
## 162 High 0.015166466 0.984833534
## 166 High 0.474765084 0.525234916
## 167 High 0.191503675 0.808496325
## 173 High 0.153251009 0.846748991
## 176 High 0.185157545 0.814842455
## 182 High 0.058745459 0.941254541
## 187 High 0.030012739 0.969987261
## 190 High 0.023048545 0.976951455
## 194 High 0.054829148 0.945170852
## 195 High 0.225743072 0.774256928
## 201 High 0.118943078 0.881056922
## 207 High 0.159359817 0.840640183
## 208 High 0.433804137 0.566195863
## 215 High 0.069295112 0.930704888
## 222 High 0.287429665 0.712570335
## 224 High 0.177837300 0.822162700
## 231 High 0.663173031 0.336826969
## 236 High 0.128407370 0.871592630
## 237 High 0.087117926 0.912882074
## 240 High 0.211568207 0.788431793
## 243 High 0.118943078 0.881056922
## 248 High 0.225743072 0.774256928
## 251 High 0.802510963 0.197489037
## 256 High 0.425809264 0.574190736
## 258 High 0.208604234 0.791395766
## 262 High 0.433804137 0.566195863
## 266 High 0.458916481 0.541083519
## 272 High 0.515097736 0.484902264
## 280 High 0.301261733 0.698738267
## 283 High 0.390880675 0.609119325
## 286 High 0.445587091 0.554412909
## 287 High 0.216476703 0.783523297
## 289 High 0.130497736 0.869502264
## 290 High 0.324389101 0.675610899
## 298 High 0.299117365 0.700882635
## 305 High 0.322034003 0.677965997
## 306 High 0.150503800 0.849496200
## 312 High 0.102017831 0.897982169
## 320 High 0.292893182 0.707106818
## 325 High 0.178390162 0.821609838
## 332 High 0.065417688 0.934582312
## 333 High 0.394656156 0.605343844
## 335 High 0.299117365 0.700882635
## 339 High 0.677983251 0.322016749
## 346 High 0.381040169 0.618959831
## 347 High 0.084143549 0.915856451
## 350 High 0.332513268 0.667486732
## 353 High 0.375699877 0.624300123
## 358 High 0.356002112 0.643997888
## 365 High 0.253020718 0.746979282
## 367 High 0.189824624 0.810175376
## 370 High 0.019857926 0.980142074
## 379 High 0.077838519 0.922161481
## 386 High 0.315641390 0.684358610
## 394 High 0.536219674 0.463780326
## 396 High 0.135699295 0.864300705
## 400 High 0.087117926 0.912882074
## 404 High 0.069987179 0.930012821
## 405 High 0.543420114 0.456579886
## 413 High 0.154491627 0.845508373
## 415 High 0.396880018 0.603119982
## 417 High 0.218484683 0.781515317
## 418 High 0.444542179 0.555457821
## 423 High 0.286471132 0.713528868
## 434 High 0.321591910 0.678408090
## 437 High 0.181118453 0.818881547
## 440 High 0.308562671 0.691437329
## 449 High 0.375209738 0.624790262
## 450 High 0.294263105 0.705736895
## 457 High 0.375209738 0.624790262
## 467 High 0.239925904 0.760074096
## 469 High 0.219694251 0.780305749
## 474 High 0.923897112 0.076102888
## 475 High 0.883349944 0.116650056
## 485 High 0.085422969 0.914577031
## 504 Low 0.291557950 0.708442050
## 511 Low 0.648063716 0.351936284
## 512 Low 0.433262622 0.566737378
## 517 Low 0.081862252 0.918137748
## 519 Low 0.768466561 0.231533439
## 520 Low 0.120730626 0.879269374
## 522 Low 0.925451743 0.074548257
## 527 Low 0.645300585 0.354699415
## 528 Low 0.271085584 0.728914416
## 529 Low 0.310908209 0.689091791
## 537 Low 0.160364901 0.839635099
## 540 Low 0.906505047 0.093494953
## 541 Low 0.445392323 0.554607677
## 547 Low 0.908345685 0.091654315
## 550 Low 0.648063716 0.351936284
## 555 Low 0.553749570 0.446250430
## 564 Low 0.076566109 0.923433891
## 570 Low 0.381921892 0.618078108
## 573 Low 0.263325647 0.736674353
## 575 Low 0.445392323 0.554607677
## 578 Low 0.200665198 0.799334802
## 581 Low 0.263325647 0.736674353
## 585 Low 0.328862698 0.671137302
## 590 Low 0.758709382 0.241290618
## 601 Low 0.954499169 0.045500831
## 602 Low 0.642143769 0.357856231
## 607 Low 0.609779376 0.390220624
## 610 Low 0.596902022 0.403097978
## 618 Low 0.770717139 0.229282861
## 624 Low 0.328862698 0.671137302
## 626 Low 0.264394512 0.735605488
## 627 Low 0.325327914 0.674672086
## 634 Low 0.859870031 0.140129969
## 640 Low 0.993333081 0.006666919
## 642 Low 0.183956965 0.816043035
## 643 Low 0.513477586 0.486522414
## 644 Low 0.822581654 0.177418346
## 645 Low 0.716527602 0.283472398
## 646 Low 0.656710655 0.343289345
## 647 Low 0.810727112 0.189272888
## 652 Low 0.207004348 0.792995652
## 658 Low 0.633864017 0.366135983
## 659 Low 0.756657553 0.243342447
## 660 Low 0.915400833 0.084599167
## 664 Low 0.317520700 0.682479300
## 666 Low 0.417931911 0.582068089
## 667 Low 0.877428595 0.122571405
## 675 Low 0.513477586 0.486522414
## 680 Low 0.994852845 0.005147155
## 681 Low 0.919786159 0.080213841
## 687 Low 0.785159445 0.214840555
## 694 Low 0.776291590 0.223708410
## 697 Low 0.504279816 0.495720184
## 701 Low 0.306165429 0.693834571
## 705 Low 0.986170693 0.013829307
## 707 Low 0.683050824 0.316949176
## 710 Low 0.639259875 0.360740125
## 716 Low 0.912237098 0.087762902
## 719 Low 0.933456382 0.066543618
## 720 Low 0.986060028 0.013939972
## 725 Low 0.989191020 0.010808980
## 727 Low 0.306165429 0.693834571
## 730 Low 0.468166324 0.531833676
## 738 Low 0.784293052 0.215706948
## 745 Low 0.580737162 0.419262838
## 748 Low 0.706782117 0.293217883
## 751 Low 0.980575777 0.019424223
## 756 Low 0.717150407 0.282849593
## 766 Low 0.850966796 0.149033204
## 769 Low 0.535328740 0.464671260
## 783 Low 0.811655480 0.188344520
## 785 Low 0.841947001 0.158052999
## 790 Low 0.923371376 0.076628624
## 793 Low 0.811655480 0.188344520
## 795 Low 0.995359452 0.004640548
## 796 Low 0.988726123 0.011273877
## 797 Low 0.613968875 0.386031125
## 801 Low 0.717464025 0.282535975
## 811 Low 0.486145842 0.513854158
## 812 Low 0.961781245 0.038218755
## 815 Low 0.940897618 0.059102382
## 816 Low 0.735876018 0.264123982
## 817 Low 0.926306993 0.073693007
## 824 Low 0.844929335 0.155070665
## 825 Low 0.844929335 0.155070665
## 826 Low 0.844929335 0.155070665
## 830 Low 0.855747468 0.144252532
## 837 Low 0.910197711 0.089802289
## 838 Low 0.735876018 0.264123982
## 844 Low 0.885768710 0.114231290
## 845 Low 0.976017309 0.023982691
## 847 Low 0.924759731 0.075240269
## 850 Low 0.874286786 0.125713214
## 852 Low 0.899365871 0.100634129
## 853 Low 0.899365871 0.100634129
## 861 Low 0.920297779 0.079702221
## 868 Low 0.980571498 0.019428502
## 874 Low 0.937576780 0.062423220
## 879 High 0.152866850 0.847133150
## 895 High 0.039547020 0.960452980
## 899 High 0.006274238 0.993725762
## 903 High 0.039547020 0.960452980
## 917 High 0.029052094 0.970947906
## 927 High 0.059988529 0.940011471
## 929 High 0.153251009 0.846748991
## 931 High 0.059988529 0.940011471
## 933 High 0.383904739 0.616095261
## 944 High 0.056057332 0.943942668
## 947 High 0.086098117 0.913901883
## 949 High 0.191869316 0.808130684
## 953 High 0.021217814 0.978782186
## 958 High 0.451442468 0.548557532
## 961 High 0.024500716 0.975499284
## 963 High 0.158030320 0.841969680
## 964 High 0.118943078 0.881056922
## 973 High 0.054972558 0.945027442
## 976 High 0.076566109 0.923433891
## 977 High 0.225743072 0.774256928
## 980 High 0.194325725 0.805674275
## 983 High 0.564399605 0.435600395
## 984 High 0.225743072 0.774256928
## 986 High 0.102017831 0.897982169
## 989 High 0.215977016 0.784022984
## 991 High 0.028136447 0.971863553
## 996 High 0.096066005 0.903933995
## 997 High 0.409217772 0.590782228
## 999 High 0.050174667 0.949825333
## 1000 High 0.049962301 0.950037699
## 1003 High 0.087117926 0.912882074
## 1008 High 0.110301043 0.889698957
## 1009 High 0.390880675 0.609119325
## 1014 High 0.054342796 0.945657204
## 1015 High 0.507479791 0.492520209
## 1040 High 0.173822737 0.826177263
## 1042 High 0.348406639 0.651593361
## 1043 High 0.664831519 0.335168481
## 1050 High 0.104964385 0.895035615
## 1052 High 0.217627219 0.782372781
## 1056 High 0.430563503 0.569436497
## 1070 High 0.576047259 0.423952741
## 1073 High 0.426680447 0.573319553
## 1074 High 0.199973894 0.800026106
## 1079 High 0.365577914 0.634422086
## 1080 High 0.495056564 0.504943436
## 1085 High 0.114818593 0.885181407
## 1087 High 0.677563755 0.322436245
## 1096 High 0.946509683 0.053490317
## 1099 High 0.476707448 0.523292552
## 1100 High 0.613540731 0.386459269
## 1102 High 0.037272700 0.962727300
## 1107 Low 0.378660320 0.621339680
## 1109 Low 0.746345913 0.253654087
## 1114 Low 0.404291532 0.595708468
## 1118 Low 0.392930424 0.607069576
## 1123 Low 0.449318373 0.550681627
## 1132 Low 0.827326214 0.172673786
## 1134 Low 0.625608459 0.374391541
## 1137 Low 0.325327914 0.674672086
## 1154 Low 0.325327914 0.674672086
## 1155 Low 0.600080216 0.399919784
## 1157 Low 0.848337304 0.151662696
## 1162 Low 0.513477586 0.486522414
## 1164 Low 0.183956965 0.816043035
## 1171 Low 0.980575777 0.019424223
## 1172 Low 0.459326567 0.540673433
## 1175 Low 0.709821865 0.290178135
## 1177 Low 0.578623299 0.421376701
## 1179 Low 0.955896251 0.044103749
## 1183 Low 0.250350289 0.749649711
## 1185 Low 0.971167041 0.028832959
## 1189 Low 0.957922810 0.042077190
## 1211 Low 0.714877986 0.285122014
## 1218 Low 0.994292098 0.005707902
## 1224 Low 0.372416312 0.627583688
## 1225 Low 0.306165429 0.693834571
## 1227 Low 0.950050219 0.049949781
## 1232 Low 0.993656325 0.006343675
## 1235 Low 0.777300182 0.222699818
## 1238 Low 0.805234045 0.194765955
## 1240 Low 0.900348204 0.099651796
## 1241 Low 0.714877986 0.285122014
## 1248 Low 0.855747468 0.144252532
## 1258 Low 0.735876018 0.264123982
## 1261 Low 0.881558385 0.118441615
## 1263 Low 0.844929335 0.155070665
## 1269 Low 0.961633901 0.038366099
## 1270 Low 0.995359452 0.004640548
## 1271 Low 0.874286786 0.125713214
## 1272 Low 0.899365871 0.100634129
## 1280 Low 0.937576780 0.062423220
## 1286 Low 0.983472472 0.016527528
## 1287 Low 0.985133774 0.014866226
## 1289 Low 0.962532737 0.037467263
## 1290 Low 0.899365871 0.100634129
## 1291 High 0.183663572 0.816336428
## 1294 High 0.737553964 0.262446036
## 1305 Low 0.961175729 0.038824271
## 1308 High 0.907626255 0.092373745
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the ROC curve for the LR_WT model on the test set, using the
# predicted probability of the "High" class as the score.
# pROC expects `levels` as c(control, case); the factor levels are reversed
# here before being passed in.
LR_WT_Test_ROC <- roc(response = LR_WT_Test$LR_WT_Observed,
                      predictor = LR_WT_Test$LR_WT_Predicted.High,
                      levels = rev(levels(LR_WT_Test$LR_WT_Observed)))
# Store the test-set AUROC as a plain number; the outer parentheses also
# print the value.
(LR_WT_Test_ROCCurveAUC <- auc(LR_WT_Test_ROC)[1])
## [1] 0.8891629
##################################
# Creating a local object
# for the train and test sets
##################################
# Working copies of the train and test sets for the pre-modelling analysis.
PMA_PreModelling_Train <- Solubility_Train
PMA_PreModelling_Test <- Solubility_Test

# Model-specific copies used by the logistic regression workflow.
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Test_LR <- PMA_PreModelling_Test

# Numeric-only views (predictors without the factor response).
# vapply() guarantees a logical mask and drop = FALSE keeps a data frame
# even if only a single numeric column is present.
PMA_PreModelling_Train_LR.Numeric <- PMA_PreModelling_Train_LR[, vapply(PMA_PreModelling_Train_LR, is.numeric, logical(1)), drop = FALSE]
PMA_PreModelling_Test_LR.Numeric <- PMA_PreModelling_Test_LR[, vapply(PMA_PreModelling_Test_LR, is.numeric, logical(1)), drop = FALSE]
##################################
# Treating data skewness
# for the train set
##################################
# No actions applied
##################################
# Treating data outliers
# for the train set
##################################
# Estimate the spatial sign transformation on the train set only, so the
# same fitted transformation can later be applied to the test set.
Transform_SpatialSign <- preProcess(PMA_PreModelling_Train_LR, method = c("spatialSign"))

# Apply the transformation to the numeric train predictors.
PMA_PreModelling_Train_LR_SST <- predict(Transform_SpatialSign, PMA_PreModelling_Train_LR.Numeric)

# Re-attach the (untransformed) response to the transformed predictors.
PMA_PreModelling_Train_LR_SST$Log_Solubility_Class <- PMA_PreModelling_Train_LR$Log_Solubility_Class
##################################
# Exploring the train set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of each spatial-sign-transformed numeric predictor in the train
# set, split by response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_SST[, vapply(PMA_PreModelling_Train_LR_SST, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_SST[["Log_Solubility_Class"]],
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SST Train Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors, with one legend column per
# response level.
featurePlot(
  x = PMA_PreModelling_Train_LR_SST[, vapply(PMA_PreModelling_Train_LR_SST, is.numeric, logical(1))],
  y = PMA_PreModelling_Train_LR_SST[["Log_Solubility_Class"]],
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SST Train Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Train_LR_SST[["Log_Solubility_Class"]]))
)
##################################
# Creating consistent fold assignments
# for the 10-Fold Cross Validation process
##################################
# Fix the RNG so the fold assignment is reproducible.
set.seed(12345678)

# Ten folds built from the response; returnTrain = TRUE makes each element
# the row indices used for TRAINING in that resample, which is the form
# trainControl(index = ...) expects.
KFold_Indices <- createFolds(PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
                             k = 10,
                             returnTrain = TRUE)

# Cross-validation control reusing the fixed folds; class probabilities are
# required by twoClassSummary to compute ROC, sensitivity and specificity.
KFold_Control <- trainControl(method = "cv",
                              index = KFold_Indices,
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)
##################################
# Setting the conditions
# for hyperparameter tuning
##################################
# No hyperparameter tuning process conducted
# hyperparameter=intercept fixed to TRUE
##################################
# Running the logistic regression model
# by setting the caret method to 'glm'
##################################
set.seed(12345678)
# Fit the logistic regression (caret method "glm") on every column except
# the response, evaluated by cross-validated ROC with the shared fold
# definition in KFold_Control.
LR_SST_Tune <- train(x = PMA_PreModelling_Train_LR_SST[, !names(PMA_PreModelling_Train_LR_SST) %in% c("Log_Solubility_Class")],
                     y = PMA_PreModelling_Train_LR_SST$Log_Solubility_Class,
                     method = "glm",
                     metric = "ROC",
                     trControl = KFold_Control)
##################################
# Reporting the cross-validation results
# for the train set
##################################
LR_SST_Tune
## Generalized Linear Model
##
## 951 samples
## 4 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 857, 855, 857, 855, 856, 856, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8660886 0.7375415 0.8493469
# Print the final fitted model stored in the train object.
LR_SST_Tune$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) HydrophilicFactor NumAtoms NumNonHAtoms
## 0.17984 1.38112 0.05357 -1.58797
## NumCarbon
## -1.66022
##
## Degrees of Freedom: 950 Total (i.e. Null); 946 Residual
## Null Deviance: 1308
## Residual Deviance: 868.7 AIC: 878.7
# Print the resampling summary (metric means and standard deviations).
LR_SST_Tune$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8660886 0.7375415 0.8493469 0.04170766 0.06066318 0.05810749
# Keep the cross-validated AUROC for the final comparison; the outer
# parentheses also print the value.
(LR_SST_Train_ROCCurveAUC <- LR_SST_Tune$results$ROC)
## [1] 0.8660886
##################################
# Identifying and plotting the
# best model predictors
##################################
# Scaled variable importance for the fitted LR_SST model.
LR_SST_VarImp <- varImp(LR_SST_Tune, scale = TRUE)

# Plot the top 4 predictors (the model has 4 predictors in total).
plot(LR_SST_VarImp,
     top = 4,
     scales = list(y = list(cex = .95)),
     main = "Ranked Variable Importance : Logistic Regression",
     xlab = "Scaled Variable Importance Metrics",
     ylab = "Predictors",
     cex = 2,
     origin = 0,
     alpha = 0.45)
##################################
# Treating data skewness
# for the test set
##################################
# No actions applied
##################################
# Treating data outliers
# for the test set
##################################
# Apply the spatial sign transformation estimated on the TRAIN set to the
# numeric test predictors (no re-estimation on test data).
PMA_PreModelling_Test_LR_SST <- predict(Transform_SpatialSign, PMA_PreModelling_Test_LR.Numeric)

# Re-attach the (untransformed) response to the transformed predictors.
PMA_PreModelling_Test_LR_SST$Log_Solubility_Class <- PMA_PreModelling_Test_LR$Log_Solubility_Class
##################################
# Exploring the test set distribution
# of the numeric predictors
# with respect to the outcome
##################################
# Box plots of each spatial-sign-transformed numeric predictor in the test
# set, split by response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_SST[, vapply(PMA_PreModelling_Test_LR_SST, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_SST[["Log_Solubility_Class"]],
  plot = "box",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SST Test Set : Numeric Predictor Distribution by Response Level"
)

# Density plots of the same predictors, with one legend column per
# response level.
featurePlot(
  x = PMA_PreModelling_Test_LR_SST[, vapply(PMA_PreModelling_Test_LR_SST, is.numeric, logical(1))],
  y = PMA_PreModelling_Test_LR_SST[["Log_Solubility_Class"]],
  plot = "density",
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  main = "LR_SST Test Set : Numeric Predictor Distribution by Response Level",
  auto.key = list(columns = nlevels(PMA_PreModelling_Test_LR_SST[["Log_Solubility_Class"]]))
)
##################################
# Independently evaluating the model
# on the test set
##################################
# Pair the observed test-set classes with the class probabilities predicted
# by the LR_SST model. predict(type = "prob") returns one column per class,
# so data.frame() expands it into LR_SST_Predicted.Low / LR_SST_Predicted.High.
LR_SST_Test <- data.frame(LR_SST_Observed = PMA_PreModelling_Test_LR_SST$Log_Solubility_Class,
                          LR_SST_Predicted = predict(LR_SST_Tune,
                                                     PMA_PreModelling_Test_LR_SST[, !names(PMA_PreModelling_Test_LR_SST) %in% c("Log_Solubility_Class")],
                                                     type = "prob"))
LR_SST_Test
## LR_SST_Observed LR_SST_Predicted.Low LR_SST_Predicted.High
## 20 High 0.09601045 0.90398955
## 21 High 0.07124817 0.92875183
## 23 High 0.08708570 0.91291430
## 25 High 0.06240783 0.93759217
## 28 High 0.08655612 0.91344388
## 31 High 0.13801964 0.86198036
## 32 High 0.16206633 0.83793367
## 33 High 0.16608858 0.83391142
## 34 High 0.16608858 0.83391142
## 37 High 0.26722180 0.73277820
## 38 High 0.26722180 0.73277820
## 42 High 0.88124946 0.11875054
## 49 High 0.18230992 0.81769008
## 54 High 0.09206998 0.90793002
## 55 High 0.08258681 0.91741319
## 58 High 0.24825896 0.75174104
## 60 High 0.13669151 0.86330849
## 61 High 0.16608858 0.83391142
## 65 High 0.10668377 0.89331623
## 69 High 0.30827773 0.69172227
## 73 High 0.11783250 0.88216750
## 86 High 0.15534315 0.84465685
## 90 High 0.09330688 0.90669312
## 91 High 0.09693780 0.90306220
## 93 High 0.09330688 0.90669312
## 96 High 0.09693780 0.90306220
## 98 High 0.10148441 0.89851559
## 100 High 0.12149523 0.87850477
## 104 High 0.85645369 0.14354631
## 112 High 0.21358439 0.78641561
## 115 High 0.88435693 0.11564307
## 119 High 0.17266138 0.82733862
## 128 High 0.17266138 0.82733862
## 130 High 0.08503124 0.91496876
## 139 High 0.08503124 0.91496876
## 143 High 0.05944353 0.94055647
## 145 High 0.18313278 0.81686722
## 146 High 0.17266138 0.82733862
## 149 High 0.13740764 0.86259236
## 150 High 0.18130010 0.81869990
## 152 High 0.16608858 0.83391142
## 157 High 0.26658480 0.73341520
## 161 High 0.17711690 0.82288310
## 162 High 0.07695031 0.92304969
## 166 High 0.41489928 0.58510072
## 167 High 0.19476630 0.80523370
## 173 High 0.17346217 0.82653783
## 176 High 0.17266138 0.82733862
## 182 High 0.14009484 0.85990516
## 187 High 0.09412624 0.90587376
## 190 High 0.08209356 0.91790644
## 194 High 0.13023242 0.86976758
## 195 High 0.16037985 0.83962015
## 201 High 0.09832548 0.90167452
## 207 High 0.12012692 0.87987308
## 208 High 0.34709451 0.65290549
## 215 High 0.13801964 0.86198036
## 222 High 0.20060734 0.79939266
## 224 High 0.09514653 0.90485347
## 231 High 0.66717290 0.33282710
## 236 High 0.12558408 0.87441592
## 237 High 0.15067576 0.84932424
## 240 High 0.14905340 0.85094660
## 243 High 0.09832548 0.90167452
## 248 High 0.16037985 0.83962015
## 251 High 0.83891581 0.16108419
## 256 High 0.48090891 0.51909109
## 258 High 0.14809516 0.85190484
## 262 High 0.34709451 0.65290549
## 266 High 0.27148151 0.72851849
## 272 High 0.42076835 0.57923165
## 280 High 0.37148413 0.62851587
## 283 High 0.33866037 0.66133963
## 286 High 0.25364415 0.74635585
## 287 High 0.15032543 0.84967457
## 289 High 0.14473001 0.85526999
## 290 High 0.36443155 0.63556845
## 298 High 0.24314628 0.75685372
## 305 High 0.17653728 0.82346272
## 306 High 0.24087855 0.75912145
## 312 High 0.09619891 0.90380109
## 320 High 0.25289581 0.74710419
## 325 High 0.13114586 0.86885414
## 332 High 0.05741702 0.94258298
## 333 High 0.33447896 0.66552104
## 335 High 0.24314628 0.75685372
## 339 High 0.76354018 0.23645982
## 346 High 0.37652804 0.62347196
## 347 High 0.13821283 0.86178717
## 350 High 0.32057761 0.67942239
## 353 High 0.26481814 0.73518186
## 358 High 0.36381440 0.63618560
## 365 High 0.15607953 0.84392047
## 367 High 0.12195706 0.87804294
## 370 High 0.09000562 0.90999438
## 379 High 0.14337494 0.85662506
## 386 High 0.27698113 0.72301887
## 394 High 0.74716571 0.25283429
## 396 High 0.08716053 0.91283947
## 400 High 0.15067576 0.84932424
## 404 High 0.13313956 0.86686044
## 405 High 0.50020364 0.49979636
## 413 High 0.10227313 0.89772687
## 415 High 0.24406283 0.75593717
## 417 High 0.22429372 0.77570628
## 418 High 0.66553672 0.33446328
## 423 High 0.23928941 0.76071059
## 434 High 0.24070864 0.75929136
## 437 High 0.23599992 0.76400008
## 440 High 0.43499212 0.56500788
## 449 High 0.54702006 0.45297994
## 450 High 0.15824323 0.84175677
## 457 High 0.54702006 0.45297994
## 467 High 0.32837858 0.67162142
## 469 High 0.18858281 0.81141719
## 474 High 0.87686350 0.12313650
## 475 High 0.87672351 0.12327649
## 485 High 0.28334254 0.71665746
## 504 Low 0.33447964 0.66552036
## 511 Low 0.60427805 0.39572195
## 512 Low 0.27333938 0.72666062
## 517 Low 0.13237259 0.86762741
## 519 Low 0.82383869 0.17616131
## 520 Low 0.16215286 0.83784714
## 522 Low 0.89837761 0.10162239
## 527 Low 0.79771009 0.20228991
## 528 Low 0.25999106 0.74000894
## 529 Low 0.49083183 0.50916817
## 537 Low 0.17724045 0.82275955
## 540 Low 0.89009261 0.10990739
## 541 Low 0.42590525 0.57409475
## 547 Low 0.90735268 0.09264732
## 550 Low 0.54923224 0.45076776
## 555 Low 0.76716356 0.23283644
## 564 Low 0.14548176 0.85451824
## 570 Low 0.61515621 0.38484379
## 573 Low 0.16732141 0.83267859
## 575 Low 0.42590525 0.57409475
## 578 Low 0.21645684 0.78354316
## 581 Low 0.16732141 0.83267859
## 585 Low 0.19579720 0.80420280
## 590 Low 0.89369063 0.10630937
## 601 Low 0.77618942 0.22381058
## 602 Low 0.83477322 0.16522678
## 607 Low 0.85385593 0.14614407
## 610 Low 0.86401175 0.13598825
## 618 Low 0.76648549 0.23351451
## 624 Low 0.19579720 0.80420280
## 626 Low 0.39079903 0.60920097
## 627 Low 0.24191991 0.75808009
## 634 Low 0.60202668 0.39797332
## 640 Low 0.85724614 0.14275386
## 642 Low 0.20979096 0.79020904
## 643 Low 0.56251635 0.43748365
## 644 Low 0.90646801 0.09353199
## 645 Low 0.82298391 0.17701609
## 646 Low 0.69510923 0.30489077
## 647 Low 0.74412785 0.25587215
## 652 Low 0.20093976 0.79906024
## 658 Low 0.78748187 0.21251813
## 659 Low 0.78880883 0.21119117
## 660 Low 0.86232533 0.13767467
## 664 Low 0.44522589 0.55477411
## 666 Low 0.21629092 0.78370908
## 667 Low 0.87244386 0.12755614
## 675 Low 0.56251635 0.43748365
## 680 Low 0.85164563 0.14835437
## 681 Low 0.86714281 0.13285719
## 687 Low 0.86569004 0.13430996
## 694 Low 0.91766599 0.08233401
## 697 Low 0.52480798 0.47519202
## 701 Low 0.31589620 0.68410380
## 705 Low 0.81924861 0.18075139
## 707 Low 0.77057558 0.22942442
## 710 Low 0.59761749 0.40238251
## 716 Low 0.82909104 0.17090896
## 719 Low 0.86938439 0.13061561
## 720 Low 0.85732525 0.14267475
## 725 Low 0.85641884 0.14358116
## 727 Low 0.31589620 0.68410380
## 730 Low 0.26046315 0.73953685
## 738 Low 0.83701684 0.16298316
## 745 Low 0.70271329 0.29728671
## 748 Low 0.79057047 0.20942953
## 751 Low 0.84112961 0.15887039
## 756 Low 0.84777333 0.15222667
## 766 Low 0.83906038 0.16093962
## 769 Low 0.29897447 0.70102553
## 783 Low 0.87595878 0.12404122
## 785 Low 0.89290405 0.10709595
## 790 Low 0.87181803 0.12818197
## 793 Low 0.87595878 0.12404122
## 795 Low 0.85791849 0.14208151
## 796 Low 0.85311015 0.14688985
## 797 Low 0.75584701 0.24415299
## 801 Low 0.53488474 0.46511526
## 811 Low 0.28134812 0.71865188
## 812 Low 0.84937551 0.15062449
## 815 Low 0.84844617 0.15155383
## 816 Low 0.68518843 0.31481157
## 817 Low 0.91238717 0.08761283
## 824 Low 0.89683201 0.10316799
## 825 Low 0.89683201 0.10316799
## 826 Low 0.89683201 0.10316799
## 830 Low 0.89256275 0.10743725
## 837 Low 0.90590054 0.09409946
## 838 Low 0.68518843 0.31481157
## 844 Low 0.90510276 0.09489724
## 845 Low 0.90252118 0.09747882
## 847 Low 0.92129203 0.07870797
## 850 Low 0.90778429 0.09221571
## 852 Low 0.91163676 0.08836324
## 853 Low 0.91163676 0.08836324
## 861 Low 0.91093261 0.08906739
## 868 Low 0.91054857 0.08945143
## 874 Low 0.90756578 0.09243422
## 879 High 0.16682367 0.83317633
## 895 High 0.09101760 0.90898240
## 899 High 0.08837534 0.91162466
## 903 High 0.09101760 0.90898240
## 917 High 0.09619149 0.90380851
## 927 High 0.09206998 0.90793002
## 929 High 0.17346217 0.82653783
## 931 High 0.09206998 0.90793002
## 933 High 0.51744048 0.48255952
## 944 High 0.08037883 0.91962117
## 947 High 0.09330688 0.90669312
## 949 High 0.10386583 0.89613417
## 953 High 0.07173711 0.92826289
## 958 High 0.45201022 0.54798978
## 961 High 0.05848371 0.94151629
## 963 High 0.15241824 0.84758176
## 964 High 0.09832548 0.90167452
## 973 High 0.07223382 0.92776618
## 976 High 0.14548176 0.85451824
## 977 High 0.16037985 0.83962015
## 980 High 0.33393088 0.66606912
## 983 High 0.40332216 0.59667784
## 984 High 0.16037985 0.83962015
## 986 High 0.09619891 0.90380109
## 989 High 0.20478385 0.79521615
## 991 High 0.06136135 0.93863865
## 996 High 0.15817455 0.84182545
## 997 High 0.29133806 0.70866194
## 999 High 0.08708570 0.91291430
## 1000 High 0.08096203 0.91903797
## 1003 High 0.15067576 0.84932424
## 1008 High 0.08653032 0.91346968
## 1009 High 0.33866037 0.66133963
## 1014 High 0.06211801 0.93788199
## 1015 High 0.68254254 0.31745746
## 1040 High 0.10197668 0.89802332
## 1042 High 0.36045604 0.63954396
## 1043 High 0.62172343 0.37827657
## 1050 High 0.07340589 0.92659411
## 1052 High 0.15785081 0.84214919
## 1056 High 0.22781432 0.77218568
## 1070 High 0.82152666 0.17847334
## 1073 High 0.26265879 0.73734121
## 1074 High 0.19955598 0.80044402
## 1079 High 0.33568619 0.66431381
## 1080 High 0.60786098 0.39213902
## 1085 High 0.14911681 0.85088319
## 1087 High 0.89957835 0.10042165
## 1096 High 0.85587597 0.14412403
## 1099 High 0.68096864 0.31903136
## 1100 High 0.69132245 0.30867755
## 1102 High 0.09812815 0.90187185
## 1107 Low 0.33306906 0.66693094
## 1109 Low 0.82831781 0.17168219
## 1114 Low 0.34072119 0.65927881
## 1118 Low 0.60266074 0.39733926
## 1123 Low 0.51544312 0.48455688
## 1132 Low 0.89086611 0.10913389
## 1134 Low 0.86196038 0.13803962
## 1137 Low 0.24191991 0.75808009
## 1154 Low 0.24191991 0.75808009
## 1155 Low 0.82419178 0.17580822
## 1157 Low 0.79195710 0.20804290
## 1162 Low 0.56251635 0.43748365
## 1164 Low 0.20979096 0.79020904
## 1171 Low 0.84112961 0.15887039
## 1172 Low 0.26658480 0.73341520
## 1175 Low 0.81770909 0.18229091
## 1177 Low 0.73170491 0.26829509
## 1179 Low 0.86003399 0.13996601
## 1183 Low 0.25318113 0.74681887
## 1185 Low 0.88391319 0.11608681
## 1189 Low 0.77072878 0.22927122
## 1211 Low 0.80815455 0.19184545
## 1218 Low 0.87884779 0.12115221
## 1224 Low 0.24958622 0.75041378
## 1225 Low 0.31589620 0.68410380
## 1227 Low 0.87910711 0.12089289
## 1232 Low 0.85531787 0.14468213
## 1235 Low 0.90784054 0.09215946
## 1238 Low 0.80846734 0.19153266
## 1240 Low 0.88005225 0.11994775
## 1241 Low 0.80815455 0.19184545
## 1248 Low 0.89256275 0.10743725
## 1258 Low 0.68518843 0.31481157
## 1261 Low 0.86946402 0.13053598
## 1263 Low 0.89683201 0.10316799
## 1269 Low 0.89945602 0.10054398
## 1270 Low 0.84133006 0.15866994
## 1271 Low 0.90778429 0.09221571
## 1272 Low 0.91163676 0.08836324
## 1280 Low 0.90756578 0.09243422
## 1286 Low 0.90492295 0.09507705
## 1287 Low 0.90812642 0.09187358
## 1289 Low 0.89739056 0.10260944
## 1290 Low 0.91163676 0.08836324
## 1291 High 0.09497549 0.90502451
## 1294 High 0.86886342 0.13113658
## 1305 Low 0.86138517 0.13861483
## 1308 High 0.75006281 0.24993719
##################################
# Reporting the independent evaluation results
# for the test set
##################################
# Build the ROC curve for the LR_SST model on the test set, using the
# predicted probability of the "High" class as the score.
# pROC expects `levels` as c(control, case); the factor levels are reversed
# here before being passed in.
LR_SST_Test_ROC <- roc(response = LR_SST_Test$LR_SST_Observed,
                       predictor = LR_SST_Test$LR_SST_Predicted.High,
                       levels = rev(levels(LR_SST_Test$LR_SST_Observed)))
# Store the test-set AUROC as a plain number; the outer parentheses also
# print the value.
(LR_SST_Test_ROCCurveAUC <- auc(LR_SST_Test_ROC)[1])
## [1] 0.8878694
##################################
# Consolidating all evaluation results
# for the train and test sets
# using the AUROC metric
##################################
# Model labels: the same ten models listed once for the cross-validation
# results and once for the test results.
Model <- rep(c("LR_REF", "LR_BCT", "LR_YJT", "LR_ET", "LR_IHST",
               "LR_LOG10T", "LR_LNT", "LR_SRT", "LR_WT", "LR_SST"),
             times = 2)

# Evaluation set for each row, aligned with Model.
Set <- c(rep("Cross-Validation", 10), rep("Test", 10))

# AUROC values in the same order as Model/Set: ten cross-validation
# estimates followed by ten test-set estimates.
ROCCurveAUC <- c(LR_Train_ROCCurveAUC, LR_BCT_Train_ROCCurveAUC,
                 LR_YJT_Train_ROCCurveAUC, LR_ET_Train_ROCCurveAUC,
                 LR_IHST_Train_ROCCurveAUC, LR_LOG10T_Train_ROCCurveAUC,
                 LR_LNT_Train_ROCCurveAUC, LR_SRT_Train_ROCCurveAUC,
                 LR_WT_Train_ROCCurveAUC, LR_SST_Train_ROCCurveAUC,
                 LR_Test_ROCCurveAUC, LR_BCT_Test_ROCCurveAUC,
                 LR_YJT_Test_ROCCurveAUC, LR_ET_Test_ROCCurveAUC,
                 LR_IHST_Test_ROCCurveAUC, LR_LOG10T_Test_ROCCurveAUC,
                 LR_LNT_Test_ROCCurveAUC, LR_SRT_Test_ROCCurveAUC,
                 LR_WT_Test_ROCCurveAUC, LR_SST_Test_ROCCurveAUC)

# Assemble the summary directly with data.frame() so ROCCurveAUC stays
# numeric (cbind() would coerce every column to character first).
# Factor levels follow first-appearance order, i.e. the model order above.
ROCCurveAUC_Summary <- data.frame(
  Model = factor(Model, levels = unique(Model)),
  Set = factor(Set, levels = c("Cross-Validation", "Test")),
  ROCCurveAUC = as.numeric(ROCCurveAUC)
)

print(ROCCurveAUC_Summary, row.names = FALSE)
## Model Set ROCCurveAUC
## LR_REF Cross-Validation 0.8747542
## LR_BCT Cross-Validation 0.8887838
## LR_YJT Cross-Validation 0.8807066
## LR_ET Cross-Validation 0.8805333
## LR_IHST Cross-Validation 0.8710722
## LR_LOG10T Cross-Validation 0.8919210
## LR_LNT Cross-Validation 0.8919210
## LR_SRT Cross-Validation 0.8844149
## LR_WT Cross-Validation 0.8742874
## LR_SST Cross-Validation 0.8660886
## LR_REF Test 0.8844739
## LR_BCT Test 0.8967622
## LR_YJT Test 0.8906181
## LR_ET Test 0.8901330
## LR_IHST Test 0.8799871
## LR_LOG10T Test 0.8988237
## LR_LNT Test 0.8988237
## LR_SRT Test 0.8946198
## LR_WT Test 0.8891629
## LR_SST Test 0.8878694
# Dot plot comparing AUROC per model, grouped by evaluation set.
# The outer parentheses print (draw) the lattice object after assigning it.
(ROCCurveAUC_Plot <- dotplot(Model ~ ROCCurveAUC,
                             data = ROCCurveAUC_Summary,
                             groups = Set,
                             main = "Classification Model Performance Comparison",
                             ylab = "Model",
                             xlab = "AUROC",
                             auto.key = list(adj = 1, space = "top", columns = 2),
                             type = c("p", "h"),
                             origin = 0,
                             alpha = 0.45,
                             pch = 16,
                             cex = 2))