##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(lattice)
library(dplyr)
library(tidyr)
library(moments)
library(skimr)
library(RANN)
library(pls)
library(corrplot)
library(tidyverse)
library(lares)
library(DMwR2)
library(gridExtra)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(stats)
##################################
# Loading source and
# formulating the train set
##################################
# Load the packaged solubility data; the objects solTrainY and solTrainX
# used below are expected to come from this call.
data(solubility)
# Bind the response in front of the predictors so that the response is the
# first column of the training frame.
Solubility_Train <- as.data.frame(cbind(solTrainY, solTrainX))
##################################
# Selecting only a subset of
# numeric predictors for the train set
##################################
# Keep the response plus five descriptors only.
Solubility_Train <- Solubility_Train[, c("solTrainY",
                                         "MolWeight",
                                         "NumCarbon",
                                         "NumChlorine",
                                         "NumHalogen",
                                         "NumMultBonds")]
##################################
# Performing a general exploration of the train set
##################################
# Row and column counts of the reduced train set (recorded output follows).
dim(Solubility_Train)
## [1] 951 6
# Column types and the first few values of each column.
str(Solubility_Train)
## 'data.frame': 951 obs. of 6 variables:
## $ solTrainY : num -3.97 -3.98 -3.99 -4 -4.06 -4.08 -4.08 -4.1 -4.1 -4.11 ...
## $ MolWeight : num 208 366 206 136 230 ...
## $ NumCarbon : int 14 21 13 10 9 10 17 12 22 14 ...
## $ NumChlorine : int 0 0 0 0 1 2 2 0 0 0 ...
## $ NumHalogen : int 0 0 0 0 1 2 2 0 1 0 ...
## $ NumMultBonds: int 16 13 7 2 6 2 18 1 4 7 ...
# Five-number summary plus mean for every column.
summary(Solubility_Train)
## solTrainY MolWeight NumCarbon NumChlorine
## Min. :-11.620 Min. : 46.09 Min. : 1.000 Min. : 0.0000
## 1st Qu.: -3.955 1st Qu.:122.61 1st Qu.: 6.000 1st Qu.: 0.0000
## Median : -2.510 Median :179.23 Median : 9.000 Median : 0.0000
## Mean : -2.719 Mean :201.65 Mean : 9.893 Mean : 0.5563
## 3rd Qu.: -1.360 3rd Qu.:264.34 3rd Qu.:12.000 3rd Qu.: 0.0000
## Max. : 1.580 Max. :665.81 Max. :33.000 Max. :10.0000
## NumHalogen NumMultBonds
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.000
## Median : 0.0000 Median : 6.000
## Mean : 0.6982 Mean : 6.148
## 3rd Qu.: 1.0000 3rd Qu.:10.000
## Max. :10.0000 Max. :25.000
##################################
# Formulating a data type assessment summary
##################################
# Work on a copy of the train set for the data type assessment.
PDA <- Solubility_Train
# One row per column: position, name and class. The outer parentheses make
# the assignment print its value.
(PDA.Summary <- data.frame(
  Column.Index = seq_along(names(PDA)),
  Column.Name = names(PDA),
  Column.Type = sapply(PDA, function(x) class(x)),
  row.names = NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 solTrainY numeric
## 2 2 MolWeight numeric
## 3 3 NumCarbon integer
## 4 4 NumChlorine integer
## 5 5 NumHalogen integer
## 6 6 NumMultBonds integer
##################################
# Loading dataset
##################################
# Work on a copy of the train set for the data quality assessment.
DQA <- Solubility_Train
##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column: position, name, class, row count, number of missing
# values and the share of non-missing values (formatted to three decimals).
# The outer parentheses make the assignment print its value.
(DQA.Summary <- data.frame(
  Column.Index = seq_along(names(DQA)),
  Column.Name = names(DQA),
  Column.Type = sapply(DQA, function(x) class(x)),
  Row.Count = sapply(DQA, function(x) nrow(DQA)),
  NA.Count = sapply(DQA, function(x) sum(is.na(x))),
  Fill.Rate = sapply(DQA, function(x) format(round((sum(!is.na(x)) / nrow(DQA)), 3), nsmall = 3)),
  row.names = NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 solTrainY numeric 951 0 1.000
## 2 2 MolWeight numeric 951 0 1.000
## 3 3 NumCarbon integer 951 0 1.000
## 4 4 NumChlorine integer 951 0 1.000
## 5 5 NumHalogen integer 951 0 1.000
## 6 6 NumMultBonds integer 951 0 1.000
##################################
# Listing all predictors
##################################
# Everything except the response column. drop = FALSE keeps a data frame
# even if only one predictor remains.
DQA.Predictors <- DQA[, !names(DQA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# vapply guarantees a logical mask even when there are no predictor columns.
DQA.Predictors.Numeric <- DQA.Predictors[, vapply(DQA.Predictors, is.numeric, logical(1)), drop = FALSE]
if (length(names(DQA.Predictors.Numeric)) > 0) {
  print(paste0("There are ",
               (length(names(DQA.Predictors.Numeric))),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 5 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# Factor-typed predictors only. drop = FALSE keeps a data frame (possibly
# with zero columns) so that names() below is always meaningful.
DQA.Predictors.Factor <- DQA.Predictors[, vapply(DQA.Predictors, is.factor, logical(1)), drop = FALSE]
if (length(names(DQA.Predictors.Factor)) > 0) {
  print(paste0("There are ",
               (length(names(DQA.Predictors.Factor))),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
# Build the factor-predictor quality summary only when factor predictors
# exist. Note that FirstModes/SecondModes are (re)defined in the global
# environment as a side effect of this branch.
if (length(names(DQA.Predictors.Factor)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every distinct non-missing value of x that attains the highest
  # frequency (more than one value when there are ties).
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent value(s) of x once all first-mode values are
  # removed. Falls back to the placeholder "x" when nothing remains or when
  # the runner-up is a missing value.
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- x[!(x %in% fm)]
    usm <- unique(sm)
    # No value left after removing the first mode(s): previously this case
    # fell through ifelse() and yielded an empty vector instead of "x".
    if (length(usm) == 0) {
      return("x")
    }
    tabsm <- tabulate(match(sm, usm))
    second <- usm[tabsm == max(tabsm)]
    if (anyNA(second)) {
      return("x")
    }
    second
  }
  # One row per factor predictor. The outer parentheses make the assignment
  # print its value.
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = sapply(DQA.Predictors.Factor, function(x) class(x)),
    Unique.Count = sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
    First.Mode.Value = sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
    Second.Mode.Value = sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
    First.Mode.Count = sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count = sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    Unique.Count.Ratio = sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x)) / nrow(DQA.Predictors.Factor)), 3), nsmall = 3)),
    First.Second.Mode.Ratio = sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1]) / sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall = 3)),
    row.names = NULL)
  )
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
# Build the numeric-predictor quality summary only when numeric predictors
# exist. Note that FirstModes/SecondModes are (re)defined in the global
# environment as a side effect of this branch.
if (length(names(DQA.Predictors.Numeric)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every distinct non-missing value of x that attains the highest
  # frequency (more than one value when there are ties).
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once all first-mode
  # values are removed. Falls back to the sentinel 0.00001 when no other
  # value exists.
  SecondModes <- function(x) {
    x_complete <- na.omit(x)
    ux <- unique(x_complete)
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- x_complete[!(x_complete %in% fm)]
    usm <- unique(sm)
    # No value left after removing the first mode(s): previously this case
    # fell through ifelse() and yielded an empty vector instead of the
    # sentinel.
    if (length(usm) == 0) {
      return(0.00001)
    }
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }
  # One row per numeric predictor; statistics are formatted as strings with
  # three decimals. The outer parentheses make the assignment print its value.
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = sapply(DQA.Predictors.Numeric, function(x) class(x)),
    Unique.Count = sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
    Unique.Count.Ratio = sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x)) / nrow(DQA.Predictors.Numeric)), 3), nsmall = 3)),
    First.Mode.Value = sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]), 3), nsmall = 3)),
    Second.Mode.Value = sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]), 3), nsmall = 3)),
    First.Mode.Count = sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
    Second.Mode.Count = sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
    First.Second.Mode.Ratio = sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1]) / sum(na.omit(x) == SecondModes(x)[1])), 3), nsmall = 3)),
    Minimum = sapply(DQA.Predictors.Numeric, function(x) format(round(min(x, na.rm = TRUE), 3), nsmall = 3)),
    Mean = sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall = 3)),
    Median = sapply(DQA.Predictors.Numeric, function(x) format(round(median(x, na.rm = TRUE), 3), nsmall = 3)),
    Maximum = sapply(DQA.Predictors.Numeric, function(x) format(round(max(x, na.rm = TRUE), 3), nsmall = 3)),
    Skewness = sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x, na.rm = TRUE), 3), nsmall = 3)),
    Kurtosis = sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x, na.rm = TRUE), 3), nsmall = 3)),
    Percentile25th = sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs = 0.25, na.rm = TRUE), 3), nsmall = 3)),
    Percentile75th = sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x, probs = 0.75, na.rm = TRUE), 3), nsmall = 3)),
    row.names = NULL)
  )
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 MolWeight numeric 646 0.679 102.200
## 2 NumCarbon integer 28 0.029 6.000
## 3 NumChlorine integer 11 0.012 0.000
## 4 NumHalogen integer 11 0.012 0.000
## 5 NumMultBonds integer 25 0.026 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 116.230 16 14 1.143
## 2 7.000 105 97 1.082
## 3 1.000 750 81 9.259
## 4 1.000 685 107 6.402
## 5 7.000 158 122 1.295
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 46.090 201.654 179.230 665.810 0.988 3.945 122.605
## 2 1.000 9.893 9.000 33.000 0.927 3.616 6.000
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000
## 4 0.000 0.698 0.000 10.000 2.691 10.808 0.000
## 5 0.000 6.148 6.000 25.000 0.670 3.053 1.000
## Percentile75th
## 1 264.340
## 2 12.000
## 3 0.000
## 4 1.000
## 5 10.000
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report the columns that contain at least one missing value. The subset is
# the last expression of its branch so that it is printed at top level.
if ((nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ])) > 0) {
  print(paste0("Missing observations noted for ",
               (nrow(DQA.Summary[DQA.Summary$NA.Count > 0, ])),
               " variable(s) with NA.Count>0 and Fill.Rate<1.0."))
  DQA.Summary[DQA.Summary$NA.Count > 0, ]
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor predictors whose first mode is more than five times as
# frequent as the second mode. The ratio column is stored as formatted
# text, hence the as.numeric(as.character()) conversion.
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[
  as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Factor.Summary[
                 as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ])),
               " factor variable(s) with First.Second.Mode.Ratio>5."))
  # Last expression of the branch, so the flagged rows are printed.
  DQA.Predictors.Factor.Summary[
    as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)) > 5, ]
} else {
  print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
# Flag numeric predictors whose first mode is more than five times as
# frequent as the second mode. The ratio column is stored as formatted
# text, hence the as.numeric(as.character()) conversion.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[
                 as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ])),
               " numeric variable(s) with First.Second.Mode.Ratio>5."))
  # Last expression of the branch, so the flagged rows are printed.
  DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)) > 5, ]
} else {
  print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 NumChlorine integer 11 0.012 0.000
## 4 NumHalogen integer 11 0.012 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 1.000 750 81 9.259
## 4 1.000 685 107 6.402
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000 0.000
## 4 0.000 0.698 0.000 10.000 2.691 10.808 0.000 1.000
# Flag numeric predictors whose share of distinct values is below 1%. The
# ratio column is stored as formatted text, hence the conversion.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ]) > 0) {
  print(paste0("Low variance observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[
                 as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ])),
               " numeric variable(s) with Unique.Count.Ratio<0.01."))
  # Last expression of the branch, so the flagged rows are printed.
  DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)) < 0.01, ]
} else {
  print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
# Flag numeric predictors with skewness above 3 or below -3. The skewness
# column is stored as formatted text, hence the conversion.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[
  as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3), ]) > 0) {
  print(paste0("High skewness observed for ",
               (nrow(DQA.Predictors.Numeric.Summary[
                 as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
                   as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3), ])),
               " numeric variable(s) with Skewness>3 or Skewness<(-3)."))
  # Last expression of the branch, so the flagged rows are printed.
  DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) > 3 |
      as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness)) < (-3), ]
} else {
  print("No skewed numeric predictors noted.")
}
## [1] "High skewness observed for 1 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 NumChlorine integer 11 0.012 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 1.000 750 81 9.259
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 0.000 0.556 0.000 10.000 3.178 13.780 0.000 0.000
##################################
# Loading dataset
##################################
# Work on a copy of the train set for the data preprocessing assessment.
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Identifying outliers for the numeric predictors
##################################
# Collects one outlier count per numeric predictor, in column order.
OutlierCountList <- c()
# seq_len() skips the loop entirely when there are no numeric predictors.
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  # Outliers as defined by boxplot.stats() (points beyond the whiskers).
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[, i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList <- append(OutlierCountList, OutlierCount)
  # Row positions of the outlying values; not used further in this section.
  OutlierIndices <- which(DPA.Predictors.Numeric[, i] %in% c(Outliers))
  boxplot(DPA.Predictors.Numeric[, i],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal = TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}
# cbind() of names and counts yields character data, so the count column is
# converted back to numeric after naming.
OutlierCountSummary <- as.data.frame(cbind(names(DPA.Predictors.Numeric), (OutlierCountList)))
names(OutlierCountSummary) <- c("NumericPredictors", "OutlierCount")
OutlierCountSummary$OutlierCount <- as.numeric(as.character(OutlierCountSummary$OutlierCount))
NumericPredictorWithOutlierCount <- nrow(OutlierCountSummary[OutlierCountSummary$OutlierCount > 0, ])
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)." ))
## [1] "5 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# Summarise the numeric predictors; the outer parentheses make the
# assignment print its value.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
## Name | DPA.Predictors.Numeric |
## Number of rows | 951 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## MolWeight | 0 | 1 | 201.65 | 97.91 | 46.09 | 122.6 | 179.23 | 264.34 | 665.81 | ▇▆▂▁▁ |
## NumCarbon | 0 | 1 | 9.89 | 5.29 | 1.00 | 6.0 | 9.00 | 12.00 | 33.00 | ▇▇▃▁▁ |
## NumChlorine | 0 | 1 | 0.56 | 1.40 | 0.00 | 0.0 | 0.00 | 0.00 | 10.00 | ▇▁▁▁▁ |
## NumHalogen | 0 | 1 | 0.70 | 1.47 | 0.00 | 0.0 | 0.00 | 1.00 | 10.00 | ▇▁▁▁▁ |
## NumMultBonds | 0 | 1 | 6.15 | 5.17 | 0.00 | 1.0 | 6.00 | 10.00 | 25.00 | ▇▆▃▁▁ |
###################################
# Verifying the data dimensions
###################################
# Rows x columns of the numeric predictor frame (recorded output follows).
dim(DPA.Predictors.Numeric)
## [1] 951 5
##################################
# Loading dataset
##################################
# Work on a fresh copy of the train set (response included).
DPA <- Solubility_Train
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses make the assignment print its value.
(DPA_Skimmed <- skim(DPA))
## Name | DPA |
## Number of rows | 951 |
## Number of columns | 6 |
## _______________________ | |
## Column type frequency: | |
## numeric | 6 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## solTrainY | 0 | 1 | -2.72 | 2.05 | -11.62 | -3.96 | -2.51 | -1.36 | 1.58 | ▁▁▃▇▃ |
## MolWeight | 0 | 1 | 201.65 | 97.91 | 46.09 | 122.60 | 179.23 | 264.34 | 665.81 | ▇▆▂▁▁ |
## NumCarbon | 0 | 1 | 9.89 | 5.29 | 1.00 | 6.00 | 9.00 | 12.00 | 33.00 | ▇▇▃▁▁ |
## NumChlorine | 0 | 1 | 0.56 | 1.40 | 0.00 | 0.00 | 0.00 | 0.00 | 10.00 | ▇▁▁▁▁ |
## NumHalogen | 0 | 1 | 0.70 | 1.47 | 0.00 | 0.00 | 0.00 | 1.00 | 10.00 | ▇▁▁▁▁ |
## NumMultBonds | 0 | 1 | 6.15 | 5.17 | 0.00 | 1.00 | 6.00 | 10.00 | 25.00 | ▇▆▃▁▁ |
##################################
# Identifying columns with low variance
###################################
# Per-column near-zero-variance metrics (saveMetrics = TRUE returns a data
# frame with freqRatio, percentUnique, zeroVar and nzv, as the recorded
# output below shows).
DPA_LowVariance <- nearZeroVar(DPA,
                               freqCut = 95/5,
                               uniqueCut = 10,
                               saveMetrics = TRUE)
# Show only the columns flagged as near-zero variance.
(DPA_LowVariance[DPA_LowVariance$nzv, ])
## [1] freqRatio percentUnique zeroVar nzv
## <0 rows> (or 0-length row.names)
# Report near-zero-variance columns and, when any exist, build a copy of
# DPA without them.
if ((nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])) == 0) {
  print("No low variance predictors noted.")
} else {
  # NOTE(review): the thresholds quoted in this message (ratio > 4, unique
  # ratio < 0.10) do not obviously match the freqCut = 95/5 used when
  # DPA_LowVariance was computed -- confirm the intended wording.
  print(paste0("Low variance observed for ",
               (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])),
               " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))
  DPA_LowVarianceForRemoval <- (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ]))
  print(paste0("Low variance can be resolved by removing ",
               (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])),
               " numeric variable(s)."))
  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])[j]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LowVarianceRemovedVariable))
  }
  # Explicit print(): a bare expression in the middle of a braced block is
  # not auto-printed, so the summary would otherwise be discarded.
  print(
    DPA %>%
      skim() %>%
      dplyr::filter(skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ]))
  )
  ##################################
  # Filtering out columns with low variance
  #################################
  DPA_ExcludedLowVariance <- DPA[, !names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ]), drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
}
## [1] "No low variance predictors noted."
##################################
# Loading dataset
##################################
# Work on a fresh copy of the train set.
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Visualizing pairwise correlation between predictors
##################################
# Pairwise significance tests; the p-value matrix ($p) is used below to
# blank out correlations that are not significant at the 0.05 level.
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
                                 method = "pearson",
                                 conf.level = .95)
corrplot(cor(DPA.Predictors.Numeric,
             method = "pearson",
             use = "pairwise.complete.obs"),
         method = "circle",
         type = "upper",
         order = "original",
         tl.col = "black",
         tl.cex = 0.75,
         tl.srt = 90,
         sig.level = 0.05,
         p.mat = DPA_CorrelationTest$p,
         insig = "blank")
##################################
# Identifying the highly correlated variables
##################################
# Pearson correlation matrix of the numeric predictors.
DPA_Correlation <- cor(DPA.Predictors.Numeric,
                       method = "pearson",
                       use = "pairwise.complete.obs")
# Number of distinct predictor pairs (upper triangle only) whose absolute
# correlation exceeds 0.95; the parentheses print the count.
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
# Report whether any predictor pair is highly correlated and, if so, list
# the top pairs via corr_cross().
if (DPA_HighlyCorrelatedCount == 0) {
  print("No highly correlated predictors noted.")
} else {
  print(paste0("High correlation observed for ",
               (DPA_HighlyCorrelatedCount),
               " pairs of numeric variable(s) with Correlation.Coefficient>0.95."))
  # Last expression of the branch, so the parenthesised assignment prints.
  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           max_pvalue = 0.05,
                                           top = DPA_HighlyCorrelatedCount,
                                           rm.na = TRUE,
                                           grid = FALSE
  ))
}
## [1] "No highly correlated predictors noted."
# When highly correlated pairs exist, pick the predictors to drop and build
# a copy of DPA without them.
if (DPA_HighlyCorrelatedCount > 0) {
  # Column positions within DPA_Correlation (i.e. within the numeric
  # predictor frame), not within DPA.
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)
  (DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))
  print(paste0("High correlation can be resolved by removing ",
               (DPA_HighlyCorrelatedForRemoval),
               " numeric variable(s)."))
  for (j in seq_len(DPA_HighlyCorrelatedForRemoval)) {
    DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }
  ##################################
  # Filtering out columns with high correlation
  #################################
  # Remove by name: the positions above index the predictor frame, while
  # DPA also carries the response column, so negative positional indexing
  # on DPA would drop the wrong columns.
  DPA_ExcludedHighCorrelation <- DPA[, !names(DPA) %in% colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated], drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
}
##################################
# Loading dataset
##################################
# Work on a fresh copy of the train set.
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Identifying the linearly dependent variables
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
# Number of linearly dependent subsets found; the parentheses print it.
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
# Report each linearly dependent subset of predictors, if any.
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               (DPA_LinearlyDependentCount),
               " subset(s) of numeric variable(s)."))
  for (i in seq_len(DPA_LinearlyDependentCount)) {
    # Translate the column positions of subset i into predictor names.
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    # paste0() is vectorised, so one line is printed per member variable.
    print(paste0("Linear dependent variable(s) for subset ",
                 i,
                 " include: ",
                 DPA_LinearlyDependentSubset))
  }
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
# When linear dependencies exist, list the predictors suggested for removal
# and build a copy of DPA without them.
if (DPA_LinearlyDependentCount > 0) {
  DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
  # $remove holds column positions within the numeric predictor frame.
  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)
  print(paste0("Linear dependency can be resolved by removing ",
               (DPA_LinearlyDependentForRemoval),
               " numeric variable(s)."))
  for (j in seq_len(DPA_LinearlyDependentForRemoval)) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }
  ##################################
  # Filtering out columns with linear dependency
  #################################
  # Remove by name: the positions above index the predictor frame, while
  # DPA also carries the response column, so negative positional indexing
  # on DPA would drop the wrong columns.
  DPA_ExcludedLinearlyDependent <- DPA[, !names(DPA) %in% colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove], drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
}
##################################
# Loading dataset
##################################
# Work on a fresh copy of the train set.
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Applying a Box-Cox transformation
##################################
# Estimate the transformation on the predictors, then apply it to the same
# data.
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses make the assignment print its value.
(DPA_BoxCoxTransformedSkimmed <- skim(DPA_BoxCoxTransformed))
## Name | DPA_BoxCoxTransformed |
## Number of rows | 951 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## MolWeight | 0 | 1 | 5.19 | 0.48 | 3.83 | 4.81 | 5.19 | 5.58 | 6.50 | ▁▆▇▆▁ |
## NumCarbon | 0 | 1 | 3.54 | 1.34 | 0.00 | 2.62 | 3.52 | 4.25 | 7.62 | ▂▇▇▃▁ |
## NumChlorine | 0 | 1 | 0.56 | 1.40 | 0.00 | 0.00 | 0.00 | 0.00 | 10.00 | ▇▁▁▁▁ |
## NumHalogen | 0 | 1 | 0.70 | 1.47 | 0.00 | 0.00 | 0.00 | 1.00 | 10.00 | ▇▁▁▁▁ |
## NumMultBonds | 0 | 1 | 6.15 | 5.17 | 0.00 | 1.00 | 6.00 | 10.00 | 25.00 | ▇▆▃▁▁ |
###################################
# Verifying the data dimensions
###################################
# Rows x columns of the Box-Cox transformed frame (recorded output follows).
dim(DPA_BoxCoxTransformed)
## [1] 951 5
##################################
# Loading dataset
##################################
# Work on a fresh copy of the train set.
DPA <- Solubility_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
DPA.Predictors <- DPA[, !names(DPA) %in% c("solTrainY"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Applying a Box-Cox transformation
##################################
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)
##################################
# Applying a center and scale data transformation
##################################
# Centering and scaling are estimated on, and applied to, the Box-Cox
# transformed predictors.
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled <- preProcess(DPA_BoxCoxTransformed, method = c("center", "scale"))
DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed <- predict(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaled, DPA_BoxCoxTransformed)
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses make the assignment print its value.
(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformedSkimmed <- skim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed))
## Name | DPA.Predictors.Numeric_Bo… |
## Number of rows | 951 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## MolWeight | 0 | 1 | 0 | 1 | -2.84 | -0.80 | -0.01 | 0.80 | 2.72 | ▁▆▇▆▁ |
## NumCarbon | 0 | 1 | 0 | 1 | -2.64 | -0.69 | -0.01 | 0.54 | 3.06 | ▂▇▇▃▁ |
## NumChlorine | 0 | 1 | 0 | 1 | -0.40 | -0.40 | -0.40 | -0.40 | 6.74 | ▇▁▁▁▁ |
## NumHalogen | 0 | 1 | 0 | 1 | -0.47 | -0.47 | -0.47 | 0.20 | 6.32 | ▇▁▁▁▁ |
## NumMultBonds | 0 | 1 | 0 | 1 | -1.19 | -1.00 | -0.03 | 0.74 | 3.65 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
###################################
# Rows x columns of the centered and scaled frame (recorded output follows).
dim(DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed)
## [1] 951 5
##################################
# Creating the pre-modelling
# train set
##################################
# The response is taken untransformed from DPA and renamed Log_Solubility.
Log_Solubility <- DPA$solTrainY
# Predictors after Box-Cox, centering and scaling.
PMA.Predictors.Numeric <- DPA.Predictors.Numeric_BoxCoxTransformed_CenteredScaledTransformed
# Response first, then the transformed predictors.
PMA_BoxCoxTransformed_CenteredScaledTransformed <- cbind(Log_Solubility, PMA.Predictors.Numeric)
PMA_PreModelling_Train <- PMA_BoxCoxTransformed_CenteredScaledTransformed
##################################
# Gathering descriptive statistics
##################################
# The outer parentheses make the assignment print its value.
(PMA_PreModelling_Train_Skimmed <- skim(PMA_PreModelling_Train))
## Name | PMA_PreModelling_Train |
## Number of rows | 951 |
## Number of columns | 6 |
## _______________________ | |
## Column type frequency: | |
## numeric | 6 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Log_Solubility | 0 | 1 | -2.72 | 2.05 | -11.62 | -3.96 | -2.51 | -1.36 | 1.58 | ▁▁▃▇▃ |
## MolWeight | 0 | 1 | 0.00 | 1.00 | -2.84 | -0.80 | -0.01 | 0.80 | 2.72 | ▁▆▇▆▁ |
## NumCarbon | 0 | 1 | 0.00 | 1.00 | -2.64 | -0.69 | -0.01 | 0.54 | 3.06 | ▂▇▇▃▁ |
## NumChlorine | 0 | 1 | 0.00 | 1.00 | -0.40 | -0.40 | -0.40 | -0.40 | 6.74 | ▇▁▁▁▁ |
## NumHalogen | 0 | 1 | 0.00 | 1.00 | -0.47 | -0.47 | -0.47 | 0.20 | 6.32 | ▇▁▁▁▁ |
## NumMultBonds | 0 | 1 | 0.00 | 1.00 | -1.19 | -1.00 | -0.03 | 0.74 | 3.65 | ▇▇▃▁▁ |
###################################
# Verifying the data dimensions
# for the train set
###################################
# Rows x columns of the pre-modelling train set (recorded output follows).
dim(PMA_PreModelling_Train)
## [1] 951 6
##################################
# Loading dataset
##################################
# Work on a copy of the pre-modelling train set for exploratory analysis.
EDA <- PMA_PreModelling_Train
##################################
# Listing all predictors
##################################
# drop = FALSE keeps a data frame even if only one column remains.
EDA.Predictors <- EDA[, !names(EDA) %in% c("Log_Solubility"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
EDA.Predictors.Numeric <- EDA.Predictors[, vapply(EDA.Predictors, is.numeric, logical(1)), drop = FALSE]
ncol(EDA.Predictors.Numeric)
## [1] 5
names(EDA.Predictors.Numeric)
## [1] "MolWeight" "NumCarbon" "NumChlorine" "NumHalogen" "NumMultBonds"
##################################
# Formulating the scatter plots
##################################
# One panel per predictor against the response, with grid, points and a
# smoother; axis labels are blanked.
featurePlot(x = EDA.Predictors.Numeric,
            y = EDA$Log_Solubility,
            between = list(x = 1, y = 1),
            type = c("g", "p", "smooth"),
            labels = rep("", 2))
##################################
# Defining a function to implement
# normal equations for estimating
# linear regression coefficients
##################################
NormalEquations_LREstimation <- function(y, X){
  # Estimates ordinary least squares coefficients from the normal equations
  # (X'X) b = X'y.
  #
  # Args:
  #   y: numeric response vector.
  #   X: data frame (or matrix) of predictors with one row per element of y;
  #      a leading column of ones is prepended here for the intercept.
  #
  # Returns:
  #   A one-column matrix of coefficients, intercept first, followed by the
  #   predictors in the column order of X.
  X <- as.matrix(data.frame(rep(1, length(y)), X))
  # Solve the linear system directly instead of forming the explicit inverse
  # solve(t(X) %*% X); same solution, cheaper and numerically more stable.
  LRcoefficients <- solve(crossprod(X), crossprod(X, y))
  LRcoefficients
}
##################################
# Loading dataset
# and restructuring to the
# y and X components
##################################
# Response vector.
y <- PMA_PreModelling_Train$Log_Solubility
# Predictor vectors, named in the order they enter the design matrix.
x1_MolWeight <- PMA_PreModelling_Train$MolWeight
x2_NumCarbon <- PMA_PreModelling_Train$NumCarbon
x3_NumChlorine <- PMA_PreModelling_Train$NumChlorine
x4_NumHalogen <- PMA_PreModelling_Train$NumHalogen
x5_NumMultBonds <- PMA_PreModelling_Train$NumMultBonds
# Predictor data frame; the estimation functions add the intercept column.
X <- data.frame(x1_MolWeight,
                x2_NumCarbon,
                x3_NumChlorine,
                x4_NumHalogen,
                x5_NumMultBonds)
##################################
# Estimating the linear regression coefficients
# using the normal equations algorithm
##################################
LR_NE <- NormalEquations_LREstimation(y = y,
                                      X = X)
##################################
# Consolidating all estimated
# linear regression coefficients
# using the normal equations algorithm
##################################
# Convert the one-column coefficient matrix into a labelled data frame.
LR_NE <- as.data.frame(LR_NE)
rownames(LR_NE) <- NULL
colnames(LR_NE) <- c("LRCoefficients")
# Labels follow the design-matrix column order: intercept, then predictors.
LR_NE$LRCoefficientNames <- c("Intercept",
                              "MolWeight",
                              "NumCarbon",
                              "NumChlorine",
                              "NumHalogen",
                              "NumMultBonds")
LR_NE$EstimationMethod <- rep("LR_NE",nrow(LR_NE))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the normal equations algorithm
##################################
print(LR_NE)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_NE
## 2 0.2049318 MolWeight LR_NE
## 3 -1.2542520 NumCarbon LR_NE
## 4 -0.1441934 NumChlorine LR_NE
## 5 -1.0135099 NumHalogen LR_NE
## 6 -0.3304828 NumMultBonds LR_NE
##################################
# Defining a function to implement
# gradient descent algorithm for estimating
# linear regression coefficients
##################################
GradientDescent_LREstimation <- function(y, X, GradientNormMinimumThreshold = 0.0001, LearningRate, Epochs){
  # Estimates linear regression coefficients by batch gradient descent.
  #
  # Args:
  #   y: numeric response vector.
  #   X: data frame (or matrix) of predictors with one row per element of y;
  #      a leading column of ones is prepended here for the intercept.
  #   GradientNormMinimumThreshold: iteration stops once the Euclidean norm
  #      of the gradient is at or below this value. Defaults to 0.0001, the
  #      value that was previously hard-coded inside the function.
  #   LearningRate: step-size multiplier. NOTE: the update multiplies the
  #      gradient by LearningRate * (2 / N), so the effective step size is
  #      much smaller than LearningRate itself; this scaling is kept as-is
  #      so existing results are reproduced.
  #   Epochs: maximum number of iterations (at least 1).
  #
  # Returns:
  #   A list with "LRCoefficients" (one-column matrix, intercept first) and
  #   "L2Loss" (root sum of squared residuals at the start of each executed
  #   epoch).
  #
  # Side effects:
  #   Calls set.seed(12345678), so the session's RNG state is reset, and
  #   prints progress messages.
  stopifnot(is.numeric(Epochs), length(Epochs) == 1, Epochs >= 1)
  X <- as.matrix(data.frame(rep(1, length(y)), X))
  N <- nrow(X)
  print("Initializing Gradient Descent Algorithm Parameters.")
  # Fixed seed so the random starting coefficients are reproducible.
  set.seed(12345678)
  Theta.InitialValue <- t(as.matrix(rnorm(n = ncol(X), mean = 0, sd = 1)))
  # Initial step from the random start (uses a 1/N scaling, as originally written).
  e <- t(y) - Theta.InitialValue %*% t(X)
  Gradient.InitialValue <- (-(2 / N) * e) %*% X
  Theta <- Theta.InitialValue - LearningRate * (1 / N) * Gradient.InitialValue
  # Preallocate the loss trace; trimmed to the executed epochs after the loop.
  L2Loss <- numeric(Epochs)
  Converged <- FALSE
  for (i in seq_len(Epochs)) {
    e <- t(y) - Theta %*% t(X)
    L2Loss[i] <- sqrt(sum(e^2))
    grad <- (-(2 / N) * e) %*% X
    Theta <- Theta - LearningRate * (2 / N) * grad
    if (sqrt(sum(grad^2)) <= GradientNormMinimumThreshold) {
      Converged <- TRUE
      break
    }
  }
  L2Loss <- L2Loss[seq_len(i)]
  # Report on the actual stopping reason, so convergence reached exactly on
  # the final epoch is not misreported as a failure.
  if (Converged) {
    print("Gradient Descent Algorithm Converged.")
  } else {
    print("Gradient Descent Algorithm Reached Last Epoch Without Convergence.")
    print(paste0("Minimum Threshold for Gradient Norm = ",
                 format(GradientNormMinimumThreshold, scientific = FALSE),
                 " Not Achieved."))
  }
  print(paste("Final Gradient Norm Determined as ",sqrt(sum(grad^2)),"at Epoch",i))
  GradientDescentAlgorithmValues <- list("LRCoefficients" = t(Theta), "L2Loss" = L2Loss)
  GradientDescentAlgorithmValues
}
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC <- GradientDescent_LREstimation(y = y,
                                                X = X,
                                                LearningRate = 200,
                                                Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.42670603153189 at Epoch 10"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_VHLR_LEC is reused below for the coefficient data frame.
LR_GDA_VHLR_LEC_Summary <- LR_GDA_VHLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC <- as.data.frame(LR_GDA_VHLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_VHLR_LEC) <- NULL
colnames(LR_GDA_VHLR_LEC) <- c("LRCoefficients")
LR_GDA_VHLR_LEC$LRCoefficientNames <- c("Intercept",
                                        "MolWeight",
                                        "NumCarbon",
                                        "NumChlorine",
                                        "NumHalogen",
                                        "NumMultBonds")
LR_GDA_VHLR_LEC$EstimationMethod <- rep("LR_GDA_VHLR_LEC",nrow(LR_GDA_VHLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
print(LR_GDA_VHLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_GDA_VHLR_LEC
## 2 0.2684003 MolWeight LR_GDA_VHLR_LEC
## 3 -1.4126207 NumCarbon LR_GDA_VHLR_LEC
## 4 -0.6009731 NumChlorine LR_GDA_VHLR_LEC
## 5 -0.6815776 NumHalogen LR_GDA_VHLR_LEC
## 6 -0.3415085 NumMultBonds LR_GDA_VHLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
LR_GDA_VHLR_LEC_Summary$Epoch <- seq_along(LR_GDA_VHLR_LEC_Summary$L2Loss)
LR_GDA_VHLR_LEC_Summary$Method <- rep("LR_GDA_VHLR_LEC",length(LR_GDA_VHLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_VHLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_VHLR_LEC_Summary$Epoch
Method <- LR_GDA_VHLR_LEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_VHLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "82.9563953049771" "1" "LR_GDA_VHLR_LEC"
## [2,] "43.3675127878632" "2" "LR_GDA_VHLR_LEC"
## [3,] "40.1055331895743" "3" "LR_GDA_VHLR_LEC"
## [4,] "38.9740740659105" "4" "LR_GDA_VHLR_LEC"
## [5,] "38.2468234561932" "5" "LR_GDA_VHLR_LEC"
## [6,] "37.7326021491187" "6" "LR_GDA_VHLR_LEC"
## [7,] "37.3604396276862" "7" "LR_GDA_VHLR_LEC"
## [8,] "37.0910709007472" "8" "LR_GDA_VHLR_LEC"
## [9,] "36.8989019283906" "9" "LR_GDA_VHLR_LEC"
## [10,] "36.7660015463775" "10" "LR_GDA_VHLR_LEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and low epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_VHLR_LEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_VHLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC <- GradientDescent_LREstimation(y = y,
                                                X = X,
                                                LearningRate = 200,
                                                Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 11.8477388521046 at Epoch 50"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_VHLR_HEC is reused below for the coefficient data frame.
LR_GDA_VHLR_HEC_Summary <- LR_GDA_VHLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC <- as.data.frame(LR_GDA_VHLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_VHLR_HEC) <- NULL
colnames(LR_GDA_VHLR_HEC) <- c("LRCoefficients")
LR_GDA_VHLR_HEC$LRCoefficientNames <- c("Intercept",
                                        "MolWeight",
                                        "NumCarbon",
                                        "NumChlorine",
                                        "NumHalogen",
                                        "NumMultBonds")
LR_GDA_VHLR_HEC$EstimationMethod <- rep("LR_GDA_VHLR_HEC",nrow(LR_GDA_VHLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
print(LR_GDA_VHLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.718570 Intercept LR_GDA_VHLR_HEC
## 2 -1.269429 MolWeight LR_GDA_VHLR_HEC
## 3 -2.369198 NumCarbon LR_GDA_VHLR_HEC
## 4 -1.218151 NumChlorine LR_GDA_VHLR_HEC
## 5 -2.002425 NumHalogen LR_GDA_VHLR_HEC
## 6 -1.427259 NumMultBonds LR_GDA_VHLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
LR_GDA_VHLR_HEC_Summary$Epoch <- seq_along(LR_GDA_VHLR_HEC_Summary$L2Loss)
LR_GDA_VHLR_HEC_Summary$Method <- rep("LR_GDA_VHLR_HEC",length(LR_GDA_VHLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_VHLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_VHLR_HEC_Summary$Epoch
Method <- LR_GDA_VHLR_HEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_VHLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "82.9563953049771" "1" "LR_GDA_VHLR_HEC"
## [2,] "43.3675127878632" "2" "LR_GDA_VHLR_HEC"
## [3,] "40.1055331895743" "3" "LR_GDA_VHLR_HEC"
## [4,] "38.9740740659105" "4" "LR_GDA_VHLR_HEC"
## [5,] "38.2468234561932" "5" "LR_GDA_VHLR_HEC"
## [6,] "37.7326021491187" "6" "LR_GDA_VHLR_HEC"
## [7,] "37.3604396276862" "7" "LR_GDA_VHLR_HEC"
## [8,] "37.0910709007472" "8" "LR_GDA_VHLR_HEC"
## [9,] "36.8989019283906" "9" "LR_GDA_VHLR_HEC"
## [10,] "36.7660015463775" "10" "LR_GDA_VHLR_HEC"
## [11,] "36.6795350594859" "11" "LR_GDA_VHLR_HEC"
## [12,] "36.6303695815112" "12" "LR_GDA_VHLR_HEC"
## [13,] "36.612173832238" "13" "LR_GDA_VHLR_HEC"
## [14,] "36.620787212362" "14" "LR_GDA_VHLR_HEC"
## [15,] "36.6537669251668" "15" "LR_GDA_VHLR_HEC"
## [16,] "36.7100652578966" "16" "LR_GDA_VHLR_HEC"
## [17,] "36.789806072341" "17" "LR_GDA_VHLR_HEC"
## [18,] "36.8941382329563" "18" "LR_GDA_VHLR_HEC"
## [19,] "37.0251492823555" "19" "LR_GDA_VHLR_HEC"
## [20,] "37.1858266763374" "20" "LR_GDA_VHLR_HEC"
## [21,] "37.380056820877" "21" "LR_GDA_VHLR_HEC"
## [22,] "37.6126542379826" "22" "LR_GDA_VHLR_HEC"
## [23,] "37.8894145699319" "23" "LR_GDA_VHLR_HEC"
## [24,] "38.2171859285369" "24" "LR_GDA_VHLR_HEC"
## [25,] "38.60395341702" "25" "LR_GDA_VHLR_HEC"
## [26,] "39.0589316131335" "26" "LR_GDA_VHLR_HEC"
## [27,] "39.5926595405127" "27" "LR_GDA_VHLR_HEC"
## [28,] "40.2170923405873" "28" "LR_GDA_VHLR_HEC"
## [29,] "40.9456836962362" "29" "LR_GDA_VHLR_HEC"
## [30,] "41.7934532850637" "30" "LR_GDA_VHLR_HEC"
## [31,] "42.7770343876188" "31" "LR_GDA_VHLR_HEC"
## [32,] "43.914698424415" "32" "LR_GDA_VHLR_HEC"
## [33,] "45.2263557053649" "33" "LR_GDA_VHLR_HEC"
## [34,] "46.7335349200765" "34" "LR_GDA_VHLR_HEC"
## [35,] "48.4593475286404" "35" "LR_GDA_VHLR_HEC"
## [36,] "50.4284466828075" "36" "LR_GDA_VHLR_HEC"
## [37,] "52.6669929716488" "37" "LR_GDA_VHLR_HEC"
## [38,] "55.2026405645276" "38" "LR_GDA_VHLR_HEC"
## [39,] "58.0645568817383" "39" "LR_GDA_VHLR_HEC"
## [40,] "61.2834867907404" "40" "LR_GDA_VHLR_HEC"
## [41,] "64.8918689145464" "41" "LR_GDA_VHLR_HEC"
## [42,] "68.9240076285283" "42" "LR_GDA_VHLR_HEC"
## [43,] "73.4163004691077" "43" "LR_GDA_VHLR_HEC"
## [44,] "78.4075176156333" "44" "LR_GDA_VHLR_HEC"
## [45,] "83.9391282093983" "45" "LR_GDA_VHLR_HEC"
## [46,] "90.0556676194485" "46" "LR_GDA_VHLR_HEC"
## [47,] "96.8051401880153" "47" "LR_GDA_VHLR_HEC"
## [48,] "104.239453181256" "48" "LR_GDA_VHLR_HEC"
## [49,] "112.414879289677" "49" "LR_GDA_VHLR_HEC"
## [50,] "121.392546766804" "50" "LR_GDA_VHLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with very high learning rate and high epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_VHLR_HEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_VHLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC <- GradientDescent_LREstimation(y = y,
                                               X = X,
                                               LearningRate = 100,
                                               Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.327225250712014 at Epoch 10"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_HLR_LEC is reused below for the coefficient data frame.
LR_GDA_HLR_LEC_Summary <- LR_GDA_HLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC <- as.data.frame(LR_GDA_HLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_HLR_LEC) <- NULL
colnames(LR_GDA_HLR_LEC) <- c("LRCoefficients")
LR_GDA_HLR_LEC$LRCoefficientNames <- c("Intercept",
                                       "MolWeight",
                                       "NumCarbon",
                                       "NumChlorine",
                                       "NumHalogen",
                                       "NumMultBonds")
LR_GDA_HLR_LEC$EstimationMethod <- rep("LR_GDA_HLR_LEC",nrow(LR_GDA_HLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and low epoch count
##################################
print(LR_GDA_HLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7055537 Intercept LR_GDA_HLR_LEC
## 2 0.6047579 MolWeight LR_GDA_HLR_LEC
## 3 -1.6363423 NumCarbon LR_GDA_HLR_LEC
## 4 -0.8489932 NumChlorine LR_GDA_HLR_LEC
## 5 -0.4926584 NumHalogen LR_GDA_HLR_LEC
## 6 -0.2958283 NumMultBonds LR_GDA_HLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and low epoch count
##################################
LR_GDA_HLR_LEC_Summary$Epoch <- seq_along(LR_GDA_HLR_LEC_Summary$L2Loss)
LR_GDA_HLR_LEC_Summary$Method <- rep("LR_GDA_HLR_LEC",length(LR_GDA_HLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_HLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_HLR_LEC_Summary$Epoch
Method <- LR_GDA_HLR_LEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_HLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "112.129202156293" "1" "LR_GDA_HLR_LEC"
## [2,] "69.9906053294776" "2" "LR_GDA_HLR_LEC"
## [3,] "52.4658187984899" "3" "LR_GDA_HLR_LEC"
## [4,] "44.8755235067431" "4" "LR_GDA_HLR_LEC"
## [5,] "41.6408804372354" "5" "LR_GDA_HLR_LEC"
## [6,] "40.1564841713508" "6" "LR_GDA_HLR_LEC"
## [7,] "39.3527407791219" "7" "LR_GDA_HLR_LEC"
## [8,] "38.8275066395866" "8" "LR_GDA_HLR_LEC"
## [9,] "38.4326912660602" "9" "LR_GDA_HLR_LEC"
## [10,] "38.1117947361666" "10" "LR_GDA_HLR_LEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and low epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_HLR_LEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_HLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC <- GradientDescent_LREstimation(y = y,
                                               X = X,
                                               LearningRate = 100,
                                               Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.0300496286624375 at Epoch 50"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_HLR_HEC is reused below for the coefficient data frame.
LR_GDA_HLR_HEC_Summary <- LR_GDA_HLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC <- as.data.frame(LR_GDA_HLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_HLR_HEC) <- NULL
colnames(LR_GDA_HLR_HEC) <- c("LRCoefficients")
LR_GDA_HLR_HEC$LRCoefficientNames <- c("Intercept",
                                       "MolWeight",
                                       "NumCarbon",
                                       "NumChlorine",
                                       "NumHalogen",
                                       "NumMultBonds")
LR_GDA_HLR_HEC$EstimationMethod <- rep("LR_GDA_HLR_HEC",nrow(LR_GDA_HLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with high learning rate and high epoch count
##################################
print(LR_GDA_HLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_GDA_HLR_HEC
## 2 0.1698594 MolWeight LR_GDA_HLR_HEC
## 3 -1.2219687 NumCarbon LR_GDA_HLR_HEC
## 4 -0.2977972 NumChlorine LR_GDA_HLR_HEC
## 5 -0.8462708 NumHalogen LR_GDA_HLR_HEC
## 6 -0.3310979 NumMultBonds LR_GDA_HLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and high epoch count
##################################
LR_GDA_HLR_HEC_Summary$Epoch <- seq_along(LR_GDA_HLR_HEC_Summary$L2Loss)
LR_GDA_HLR_HEC_Summary$Method <- rep("LR_GDA_HLR_HEC",length(LR_GDA_HLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_HLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_HLR_HEC_Summary$Epoch
Method <- LR_GDA_HLR_HEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_HLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "112.129202156293" "1" "LR_GDA_HLR_HEC"
## [2,] "69.9906053294776" "2" "LR_GDA_HLR_HEC"
## [3,] "52.4658187984899" "3" "LR_GDA_HLR_HEC"
## [4,] "44.8755235067431" "4" "LR_GDA_HLR_HEC"
## [5,] "41.6408804372354" "5" "LR_GDA_HLR_HEC"
## [6,] "40.1564841713508" "6" "LR_GDA_HLR_HEC"
## [7,] "39.3527407791219" "7" "LR_GDA_HLR_HEC"
## [8,] "38.8275066395866" "8" "LR_GDA_HLR_HEC"
## [9,] "38.4326912660602" "9" "LR_GDA_HLR_HEC"
## [10,] "38.1117947361666" "10" "LR_GDA_HLR_HEC"
## [11,] "37.8410814575013" "11" "LR_GDA_HLR_HEC"
## [12,] "37.6088433323443" "12" "LR_GDA_HLR_HEC"
## [13,] "37.4080884025972" "13" "LR_GDA_HLR_HEC"
## [14,] "37.2339090197462" "14" "LR_GDA_HLR_HEC"
## [15,] "37.0824885446874" "15" "LR_GDA_HLR_HEC"
## [16,] "36.9506925242453" "16" "LR_GDA_HLR_HEC"
## [17,] "36.8358764051491" "17" "LR_GDA_HLR_HEC"
## [18,] "36.7357790012313" "18" "LR_GDA_HLR_HEC"
## [19,] "36.6484536165958" "19" "LR_GDA_HLR_HEC"
## [20,] "36.5722180985614" "20" "LR_GDA_HLR_HEC"
## [21,] "36.5056158801148" "21" "LR_GDA_HLR_HEC"
## [22,] "36.447384251053" "22" "LR_GDA_HLR_HEC"
## [23,] "36.3964278466087" "23" "LR_GDA_HLR_HEC"
## [24,] "36.3517961489294" "24" "LR_GDA_HLR_HEC"
## [25,] "36.3126642087895" "25" "LR_GDA_HLR_HEC"
## [26,] "36.2783160262168" "26" "LR_GDA_HLR_HEC"
## [27,] "36.2481301692672" "27" "LR_GDA_HLR_HEC"
## [28,] "36.2215673013761" "28" "LR_GDA_HLR_HEC"
## [29,] "36.1981593502996" "29" "LR_GDA_HLR_HEC"
## [30,] "36.177500096855" "30" "LR_GDA_HLR_HEC"
## [31,] "36.1592369958326" "31" "LR_GDA_HLR_HEC"
## [32,] "36.1430640683384" "32" "LR_GDA_HLR_HEC"
## [33,] "36.1287157267037" "33" "LR_GDA_HLR_HEC"
## [34,] "36.1159614113633" "34" "LR_GDA_HLR_HEC"
## [35,] "36.1046009346445" "35" "LR_GDA_HLR_HEC"
## [36,] "36.0944604398" "36" "LR_GDA_HLR_HEC"
## [37,] "36.0853888952591" "37" "LR_GDA_HLR_HEC"
## [38,] "36.0772550542315" "38" "LR_GDA_HLR_HEC"
## [39,] "36.0699448186983" "39" "LR_GDA_HLR_HEC"
## [40,] "36.0633589546223" "40" "LR_GDA_HLR_HEC"
## [41,] "36.057411112046" "41" "LR_GDA_HLR_HEC"
## [42,] "36.0520261097326" "42" "LR_GDA_HLR_HEC"
## [43,] "36.0471384492471" "43" "LR_GDA_HLR_HEC"
## [44,] "36.0426910279566" "44" "LR_GDA_HLR_HEC"
## [45,] "36.0386340244292" "45" "LR_GDA_HLR_HEC"
## [46,] "36.0349239332061" "46" "LR_GDA_HLR_HEC"
## [47,] "36.0315227289593" "47" "LR_GDA_HLR_HEC"
## [48,] "36.028397142702" "48" "LR_GDA_HLR_HEC"
## [49,] "36.0255180350174" "49" "LR_GDA_HLR_HEC"
## [50,] "36.022859853282" "50" "LR_GDA_HLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with high learning rate and high epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_HLR_HEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_HLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC <- GradientDescent_LREstimation(y = y,
                                               X = X,
                                               LearningRate = 50,
                                               Epochs = 10)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.962210610545313 at Epoch 10"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_LLR_LEC is reused below for the coefficient data frame.
LR_GDA_LLR_LEC_Summary <- LR_GDA_LLR_LEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC <- as.data.frame(LR_GDA_LLR_LEC_Summary$LRCoefficients)
rownames(LR_GDA_LLR_LEC) <- NULL
colnames(LR_GDA_LLR_LEC) <- c("LRCoefficients")
LR_GDA_LLR_LEC$LRCoefficientNames <- c("Intercept",
                                       "MolWeight",
                                       "NumCarbon",
                                       "NumChlorine",
                                       "NumHalogen",
                                       "NumMultBonds")
LR_GDA_LLR_LEC$EstimationMethod <- rep("LR_GDA_LLR_LEC",nrow(LR_GDA_LLR_LEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and low epoch count
##################################
print(LR_GDA_LLR_LEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.3922400 Intercept LR_GDA_LLR_LEC
## 2 0.8939686 MolWeight LR_GDA_LLR_LEC
## 3 -1.8435282 NumCarbon LR_GDA_LLR_LEC
## 4 -1.0691289 NumChlorine LR_GDA_LLR_LEC
## 5 -0.3806898 NumHalogen LR_GDA_LLR_LEC
## 6 -0.3634163 NumMultBonds LR_GDA_LLR_LEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and low epoch count
##################################
LR_GDA_LLR_LEC_Summary$Epoch <- seq_along(LR_GDA_LLR_LEC_Summary$L2Loss)
LR_GDA_LLR_LEC_Summary$Method <- rep("LR_GDA_LLR_LEC",length(LR_GDA_LLR_LEC_Summary$L2Loss))
L2Loss <- LR_GDA_LLR_LEC_Summary$L2Loss
Epoch <- LR_GDA_LLR_LEC_Summary$Epoch
Method <- LR_GDA_LLR_LEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_LLR_LEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "128.868015504947" "1" "LR_GDA_LLR_LEC"
## [2,] "100.674236221077" "2" "LR_GDA_LLR_LEC"
## [3,] "81.8154777197929" "3" "LR_GDA_LLR_LEC"
## [4,] "68.7468688791543" "4" "LR_GDA_LLR_LEC"
## [5,] "59.590708566118" "5" "LR_GDA_LLR_LEC"
## [6,] "53.1931626792032" "6" "LR_GDA_LLR_LEC"
## [7,] "48.758193032101" "7" "LR_GDA_LLR_LEC"
## [8,] "45.7046714572526" "8" "LR_GDA_LLR_LEC"
## [9,] "43.605750421321" "9" "LR_GDA_LLR_LEC"
## [10,] "42.1544665405458" "10" "LR_GDA_LLR_LEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and low epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_LLR_LEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_LLR_LEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Estimating the linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC <- GradientDescent_LREstimation(y = y,
                                               X = X,
                                               LearningRate = 50,
                                               Epochs = 50)
## [1] "Initializing Gradient Descent Algorithm Parameters."
## [1] "Gradient Descent Algorithm Reached Last Epoch Without Convergence."
## [1] "Minimum Threshold for Gradient Norm = 0.0001 Not Achieved."
## [1] "Final Gradient Norm Determined as 0.114815262946592 at Epoch 50"
# Keep the raw result list (coefficients + loss trace) under a separate name;
# LR_GDA_LLR_HEC is reused below for the coefficient data frame.
LR_GDA_LLR_HEC_Summary <- LR_GDA_LLR_HEC
##################################
# Consolidating all estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC <- as.data.frame(LR_GDA_LLR_HEC_Summary$LRCoefficients)
rownames(LR_GDA_LLR_HEC) <- NULL
colnames(LR_GDA_LLR_HEC) <- c("LRCoefficients")
LR_GDA_LLR_HEC$LRCoefficientNames <- c("Intercept",
                                       "MolWeight",
                                       "NumCarbon",
                                       "NumChlorine",
                                       "NumHalogen",
                                       "NumMultBonds")
LR_GDA_LLR_HEC$EstimationMethod <- rep("LR_GDA_LLR_HEC",nrow(LR_GDA_LLR_HEC))
##################################
# Summarizing the estimated
# linear regression coefficients
# using the gradient descent algorithm
# with low learning rate and high epoch count
##################################
print(LR_GDA_LLR_HEC)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185441 Intercept LR_GDA_LLR_HEC
## 2 0.2813439 MolWeight LR_GDA_LLR_HEC
## 3 -1.3371016 NumCarbon LR_GDA_LLR_HEC
## 4 -0.5127712 NumChlorine LR_GDA_LLR_HEC
## 5 -0.6839773 NumHalogen LR_GDA_LLR_HEC
## 6 -0.3113283 NumMultBonds LR_GDA_LLR_HEC
##################################
# Gathering the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and high epoch count
##################################
LR_GDA_LLR_HEC_Summary$Epoch <- seq_along(LR_GDA_LLR_HEC_Summary$L2Loss)
LR_GDA_LLR_HEC_Summary$Method <- rep("LR_GDA_LLR_HEC",length(LR_GDA_LLR_HEC_Summary$L2Loss))
L2Loss <- LR_GDA_LLR_HEC_Summary$L2Loss
Epoch <- LR_GDA_LLR_HEC_Summary$Epoch
Method <- LR_GDA_LLR_HEC_Summary$Method
# cbind() with a character column yields a character matrix; the numeric
# columns are converted back when the per-run summaries are consolidated.
(LR_GDA_LLR_HEC_ConsolidatedSummary <- cbind(L2Loss, Epoch, Method))
## L2Loss Epoch Method
## [1,] "128.868015504947" "1" "LR_GDA_LLR_HEC"
## [2,] "100.674236221077" "2" "LR_GDA_LLR_HEC"
## [3,] "81.8154777197929" "3" "LR_GDA_LLR_HEC"
## [4,] "68.7468688791543" "4" "LR_GDA_LLR_HEC"
## [5,] "59.590708566118" "5" "LR_GDA_LLR_HEC"
## [6,] "53.1931626792032" "6" "LR_GDA_LLR_HEC"
## [7,] "48.758193032101" "7" "LR_GDA_LLR_HEC"
## [8,] "45.7046714572526" "8" "LR_GDA_LLR_HEC"
## [9,] "43.605750421321" "9" "LR_GDA_LLR_HEC"
## [10,] "42.1544665405458" "10" "LR_GDA_LLR_HEC"
## [11,] "41.1364565433462" "11" "LR_GDA_LLR_HEC"
## [12,] "40.406033177599" "12" "LR_GDA_LLR_HEC"
## [13,] "39.8662448387947" "13" "LR_GDA_LLR_HEC"
## [14,] "39.4534953274598" "14" "LR_GDA_LLR_HEC"
## [15,] "39.126421847265" "15" "LR_GDA_LLR_HEC"
## [16,] "38.8582166343799" "16" "LR_GDA_LLR_HEC"
## [17,] "38.631490240265" "17" "LR_GDA_LLR_HEC"
## [18,] "38.4349057071761" "18" "LR_GDA_LLR_HEC"
## [19,] "38.2610024308783" "19" "LR_GDA_LLR_HEC"
## [20,] "38.1048009669459" "20" "LR_GDA_LLR_HEC"
## [21,] "37.9629129868599" "21" "LR_GDA_LLR_HEC"
## [22,] "37.8329749643241" "22" "LR_GDA_LLR_HEC"
## [23,] "37.7132881309214" "23" "LR_GDA_LLR_HEC"
## [24,] "37.6025894071465" "24" "LR_GDA_LLR_HEC"
## [25,] "37.4999053476352" "25" "LR_GDA_LLR_HEC"
## [26,] "37.4044586694964" "26" "LR_GDA_LLR_HEC"
## [27,] "37.3156081003806" "27" "LR_GDA_LLR_HEC"
## [28,] "37.2328093671795" "28" "LR_GDA_LLR_HEC"
## [29,] "37.1555896282248" "29" "LR_GDA_LLR_HEC"
## [30,] "37.0835304829609" "30" "LR_GDA_LLR_HEC"
## [31,] "37.0162564799321" "31" "LR_GDA_LLR_HEC"
## [32,] "36.9534271714769" "32" "LR_GDA_LLR_HEC"
## [33,] "36.8947314752451" "33" "LR_GDA_LLR_HEC"
## [34,] "36.8398835522538" "34" "LR_GDA_LLR_HEC"
## [35,] "36.7886196955981" "35" "LR_GDA_LLR_HEC"
## [36,] "36.7406959041593" "36" "LR_GDA_LLR_HEC"
## [37,] "36.6958859301698" "37" "LR_GDA_LLR_HEC"
## [38,] "36.6539796624992" "38" "LR_GDA_LLR_HEC"
## [39,] "36.6147817542687" "39" "LR_GDA_LLR_HEC"
## [40,] "36.5781104334971" "40" "LR_GDA_LLR_HEC"
## [41,] "36.5437964549883" "41" "LR_GDA_LLR_HEC"
## [42,] "36.5116821644221" "42" "LR_GDA_LLR_HEC"
## [43,] "36.4816206540301" "43" "LR_GDA_LLR_HEC"
## [44,] "36.4534749948634" "44" "LR_GDA_LLR_HEC"
## [45,] "36.4271175344747" "45" "LR_GDA_LLR_HEC"
## [46,] "36.4024292514622" "46" "LR_GDA_LLR_HEC"
## [47,] "36.3792991601602" "47" "LR_GDA_LLR_HEC"
## [48,] "36.3576237600811" "48" "LR_GDA_LLR_HEC"
## [49,] "36.3373065256698" "49" "LR_GDA_LLR_HEC"
## [50,] "36.3182574326452" "50" "LR_GDA_LLR_HEC"
##################################
# Plotting the loss function optimization data
# for the gradient descent algorithm
# with low learning rate and high epoch count
##################################
# Scatter of L2 loss per epoch for this run. The axis limits are fixed to the
# same values used by the other loss-profile plots in this script so the
# profiles can be compared on a common scale.
# NOTE(review): `origin` is not expected to matter for a points-only panel — confirm.
xyplot(L2Loss ~ Epoch,
data = LR_GDA_LLR_HEC_Summary,
main = "Loss Function Optimization Profile : LR_GDA_LLR_HEC",
ylab = "L2 Loss",
xlab = "Epoch",
type=c("p"),
origin = 0,
alpha = 0.45,
pch = 16,
cex = 1,
xlim = c(0, 50),
ylim = c(30, 150))
##################################
# Consolidating the loss function optimization data
# for all gradient descent algorithms
# with different learning rates and epoch counts
##################################
# Stack the six per-run character matrices (columns L2Loss, Epoch, Method).
LR_GDA_ConsolidatedSummary <- rbind(LR_GDA_VHLR_LEC_ConsolidatedSummary,
                                    LR_GDA_VHLR_HEC_ConsolidatedSummary,
                                    LR_GDA_HLR_LEC_ConsolidatedSummary,
                                    LR_GDA_HLR_HEC_ConsolidatedSummary,
                                    LR_GDA_LLR_LEC_ConsolidatedSummary,
                                    LR_GDA_LLR_HEC_ConsolidatedSummary)
LR_GDA_ConsolidatedSummary <- as.data.frame(LR_GDA_ConsolidatedSummary)
# The stacked values are character strings; restore the numeric columns.
LR_GDA_ConsolidatedSummary$L2Loss <- as.numeric(as.character(LR_GDA_ConsolidatedSummary$L2Loss))
LR_GDA_ConsolidatedSummary$Epoch <- as.numeric(as.character(LR_GDA_ConsolidatedSummary$Epoch))
# Explicit level order controls the panel order in the conditioned plot below.
LR_GDA_ConsolidatedSummary$Method <- factor(LR_GDA_ConsolidatedSummary$Method,
                                            levels = c("LR_GDA_LLR_LEC",
                                                       "LR_GDA_HLR_LEC",
                                                       "LR_GDA_VHLR_LEC",
                                                       "LR_GDA_LLR_HEC",
                                                       "LR_GDA_HLR_HEC",
                                                       "LR_GDA_VHLR_HEC"))
##################################
# Plotting the loss function optimization data
# for all gradient descent algorithms
# with different learning rates and epoch counts
##################################
# One panel per Method, all sharing the same fixed axis limits.
xyplot(L2Loss ~ Epoch | Method,
       data = LR_GDA_ConsolidatedSummary,
       main = "Loss Function Optimization Profile for Gradient Descent Algorithm with Different Learning Rates and Epoch Counts",
       ylab = "L2 Loss",
       xlab = "Epoch",
       type=c("p"),
       origin = 0,
       alpha = 0.45,
       pch = 16,
       cex = 1,
       xlim = c(0, 50),
       ylim = c(30, 150))
##################################
# Gathering the estimated coefficients
# for normal equations and all gradient descent algorithms
# with different learning rates and epoch counts
##################################
# One copy of the normal-equations coefficients per gradient descent run, so
# each comparison group contains both the LR_NE rows and the matching run.
LR_NE_VS_LR_GDA_VHLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_VHLR_HEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_HLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_HLR_HEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_LLR_LEC <- as.data.frame(LR_NE)
LR_NE_VS_LR_GDA_LLR_HEC <- as.data.frame(LR_NE)
# Tag every row with its comparison group. Each rep() is sized from the data
# frame being assigned to (the original reused nrow() of a sibling frame).
LR_NE_VS_LR_GDA_VHLR_LEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_LEC",nrow(LR_NE_VS_LR_GDA_VHLR_LEC))
LR_NE_VS_LR_GDA_VHLR_HEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_HEC",nrow(LR_NE_VS_LR_GDA_VHLR_HEC))
LR_NE_VS_LR_GDA_HLR_LEC$Group <- rep("LR_NE Versus LR_GDA_HLR_LEC",nrow(LR_NE_VS_LR_GDA_HLR_LEC))
LR_NE_VS_LR_GDA_HLR_HEC$Group <- rep("LR_NE Versus LR_GDA_HLR_HEC",nrow(LR_NE_VS_LR_GDA_HLR_HEC))
LR_NE_VS_LR_GDA_LLR_LEC$Group <- rep("LR_NE Versus LR_GDA_LLR_LEC",nrow(LR_NE_VS_LR_GDA_LLR_LEC))
LR_NE_VS_LR_GDA_LLR_HEC$Group <- rep("LR_NE Versus LR_GDA_LLR_HEC",nrow(LR_NE_VS_LR_GDA_LLR_HEC))
LR_GDA_VHLR_LEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_LEC",nrow(LR_GDA_VHLR_LEC))
LR_GDA_VHLR_HEC$Group <- rep("LR_NE Versus LR_GDA_VHLR_HEC",nrow(LR_GDA_VHLR_HEC))
LR_GDA_HLR_LEC$Group <- rep("LR_NE Versus LR_GDA_HLR_LEC",nrow(LR_GDA_HLR_LEC))
LR_GDA_HLR_HEC$Group <- rep("LR_NE Versus LR_GDA_HLR_HEC",nrow(LR_GDA_HLR_HEC))
LR_GDA_LLR_LEC$Group <- rep("LR_NE Versus LR_GDA_LLR_LEC",nrow(LR_GDA_LLR_LEC))
LR_GDA_LLR_HEC$Group <- rep("LR_NE Versus LR_GDA_LLR_HEC",nrow(LR_GDA_LLR_HEC))
##################################
# Consolidating the estimated coefficients
# for normal equations and all gradient descent algorithms
# with different learning rates and epoch counts
##################################
# Stacking the six normal-equation copies and the six gradient descent
# estimates row-wise into a single long-format data frame
LR_NE_GDA_ConsolidatedSummary <- rbind(LR_NE_VS_LR_GDA_VHLR_LEC,
                                       LR_NE_VS_LR_GDA_VHLR_HEC,
                                       LR_NE_VS_LR_GDA_HLR_LEC,
                                       LR_NE_VS_LR_GDA_HLR_HEC,
                                       LR_NE_VS_LR_GDA_LLR_LEC,
                                       LR_NE_VS_LR_GDA_LLR_HEC,
                                       LR_GDA_VHLR_LEC,
                                       LR_GDA_VHLR_HEC,
                                       LR_GDA_HLR_LEC,
                                       LR_GDA_HLR_HEC,
                                       LR_GDA_LLR_LEC,
                                       LR_GDA_LLR_HEC)

LR_NE_GDA_ConsolidatedSummary <- as.data.frame(LR_NE_GDA_ConsolidatedSummary)

# Converting through character first so that a factor-typed column would be
# parsed from its labels rather than from its internal integer codes
LR_NE_GDA_ConsolidatedSummary$LRCoefficients <- as.numeric(
  as.character(LR_NE_GDA_ConsolidatedSummary$LRCoefficients)
)

# Fixing the level order of the comparison groups
# (low, high, very high learning rate; low epoch count first, then high)
LR_NE_GDA_ConsolidatedSummary$Group <- factor(
  LR_NE_GDA_ConsolidatedSummary$Group,
  levels = c("LR_NE Versus LR_GDA_LLR_LEC",
             "LR_NE Versus LR_GDA_HLR_LEC",
             "LR_NE Versus LR_GDA_VHLR_LEC",
             "LR_NE Versus LR_GDA_LLR_HEC",
             "LR_NE Versus LR_GDA_HLR_HEC",
             "LR_NE Versus LR_GDA_VHLR_HEC")
)

# Fixing the level order of the coefficient names
LR_NE_GDA_ConsolidatedSummary$LRCoefficientNames <- factor(
  LR_NE_GDA_ConsolidatedSummary$LRCoefficientNames,
  levels = c("NumMultBonds",
             "NumHalogen",
             "NumChlorine",
             "NumCarbon",
             "MolWeight",
             "Intercept")
)

# Fixing the level order of the estimation methods, with LR_NE first
LR_NE_GDA_ConsolidatedSummary$EstimationMethod <- factor(
  LR_NE_GDA_ConsolidatedSummary$EstimationMethod,
  levels = c("LR_NE",
             "LR_GDA_LLR_LEC",
             "LR_GDA_LLR_HEC",
             "LR_GDA_HLR_LEC",
             "LR_GDA_HLR_HEC",
             "LR_GDA_VHLR_LEC",
             "LR_GDA_VHLR_HEC")
)

print(LR_NE_GDA_ConsolidatedSummary)
## LRCoefficients LRCoefficientNames EstimationMethod
## 1 -2.7185699 Intercept LR_NE
## 2 0.2049318 MolWeight LR_NE
## 3 -1.2542520 NumCarbon LR_NE
## 4 -0.1441934 NumChlorine LR_NE
## 5 -1.0135099 NumHalogen LR_NE
## 6 -0.3304828 NumMultBonds LR_NE
## 7 -2.7185699 Intercept LR_NE
## 8 0.2049318 MolWeight LR_NE
## 9 -1.2542520 NumCarbon LR_NE
## 10 -0.1441934 NumChlorine LR_NE
## 11 -1.0135099 NumHalogen LR_NE
## 12 -0.3304828 NumMultBonds LR_NE
## 13 -2.7185699 Intercept LR_NE
## 14 0.2049318 MolWeight LR_NE
## 15 -1.2542520 NumCarbon LR_NE
## 16 -0.1441934 NumChlorine LR_NE
## 17 -1.0135099 NumHalogen LR_NE
## 18 -0.3304828 NumMultBonds LR_NE
## 19 -2.7185699 Intercept LR_NE
## 20 0.2049318 MolWeight LR_NE
## 21 -1.2542520 NumCarbon LR_NE
## 22 -0.1441934 NumChlorine LR_NE
## 23 -1.0135099 NumHalogen LR_NE
## 24 -0.3304828 NumMultBonds LR_NE
## 25 -2.7185699 Intercept LR_NE
## 26 0.2049318 MolWeight LR_NE
## 27 -1.2542520 NumCarbon LR_NE
## 28 -0.1441934 NumChlorine LR_NE
## 29 -1.0135099 NumHalogen LR_NE
## 30 -0.3304828 NumMultBonds LR_NE
## 31 -2.7185699 Intercept LR_NE
## 32 0.2049318 MolWeight LR_NE
## 33 -1.2542520 NumCarbon LR_NE
## 34 -0.1441934 NumChlorine LR_NE
## 35 -1.0135099 NumHalogen LR_NE
## 36 -0.3304828 NumMultBonds LR_NE
## 37 -2.7185699 Intercept LR_GDA_VHLR_LEC
## 38 0.2684003 MolWeight LR_GDA_VHLR_LEC
## 39 -1.4126207 NumCarbon LR_GDA_VHLR_LEC
## 40 -0.6009731 NumChlorine LR_GDA_VHLR_LEC
## 41 -0.6815776 NumHalogen LR_GDA_VHLR_LEC
## 42 -0.3415085 NumMultBonds LR_GDA_VHLR_LEC
## 43 -2.7185699 Intercept LR_GDA_VHLR_HEC
## 44 -1.2694291 MolWeight LR_GDA_VHLR_HEC
## 45 -2.3691976 NumCarbon LR_GDA_VHLR_HEC
## 46 -1.2181515 NumChlorine LR_GDA_VHLR_HEC
## 47 -2.0024255 NumHalogen LR_GDA_VHLR_HEC
## 48 -1.4272592 NumMultBonds LR_GDA_VHLR_HEC
## 49 -2.7055537 Intercept LR_GDA_HLR_LEC
## 50 0.6047579 MolWeight LR_GDA_HLR_LEC
## 51 -1.6363423 NumCarbon LR_GDA_HLR_LEC
## 52 -0.8489932 NumChlorine LR_GDA_HLR_LEC
## 53 -0.4926584 NumHalogen LR_GDA_HLR_LEC
## 54 -0.2958283 NumMultBonds LR_GDA_HLR_LEC
## 55 -2.7185699 Intercept LR_GDA_HLR_HEC
## 56 0.1698594 MolWeight LR_GDA_HLR_HEC
## 57 -1.2219687 NumCarbon LR_GDA_HLR_HEC
## 58 -0.2977972 NumChlorine LR_GDA_HLR_HEC
## 59 -0.8462708 NumHalogen LR_GDA_HLR_HEC
## 60 -0.3310979 NumMultBonds LR_GDA_HLR_HEC
## 61 -2.3922400 Intercept LR_GDA_LLR_LEC
## 62 0.8939686 MolWeight LR_GDA_LLR_LEC
## 63 -1.8435282 NumCarbon LR_GDA_LLR_LEC
## 64 -1.0691289 NumChlorine LR_GDA_LLR_LEC
## 65 -0.3806898 NumHalogen LR_GDA_LLR_LEC
## 66 -0.3634163 NumMultBonds LR_GDA_LLR_LEC
## 67 -2.7185441 Intercept LR_GDA_LLR_HEC
## 68 0.2813439 MolWeight LR_GDA_LLR_HEC
## 69 -1.3371016 NumCarbon LR_GDA_LLR_HEC
## 70 -0.5127712 NumChlorine LR_GDA_LLR_HEC
## 71 -0.6839773 NumHalogen LR_GDA_LLR_HEC
## 72 -0.3113283 NumMultBonds LR_GDA_LLR_HEC
## Group
## 1 LR_NE Versus LR_GDA_VHLR_LEC
## 2 LR_NE Versus LR_GDA_VHLR_LEC
## 3 LR_NE Versus LR_GDA_VHLR_LEC
## 4 LR_NE Versus LR_GDA_VHLR_LEC
## 5 LR_NE Versus LR_GDA_VHLR_LEC
## 6 LR_NE Versus LR_GDA_VHLR_LEC
## 7 LR_NE Versus LR_GDA_VHLR_HEC
## 8 LR_NE Versus LR_GDA_VHLR_HEC
## 9 LR_NE Versus LR_GDA_VHLR_HEC
## 10 LR_NE Versus LR_GDA_VHLR_HEC
## 11 LR_NE Versus LR_GDA_VHLR_HEC
## 12 LR_NE Versus LR_GDA_VHLR_HEC
## 13 LR_NE Versus LR_GDA_HLR_LEC
## 14 LR_NE Versus LR_GDA_HLR_LEC
## 15 LR_NE Versus LR_GDA_HLR_LEC
## 16 LR_NE Versus LR_GDA_HLR_LEC
## 17 LR_NE Versus LR_GDA_HLR_LEC
## 18 LR_NE Versus LR_GDA_HLR_LEC
## 19 LR_NE Versus LR_GDA_HLR_HEC
## 20 LR_NE Versus LR_GDA_HLR_HEC
## 21 LR_NE Versus LR_GDA_HLR_HEC
## 22 LR_NE Versus LR_GDA_HLR_HEC
## 23 LR_NE Versus LR_GDA_HLR_HEC
## 24 LR_NE Versus LR_GDA_HLR_HEC
## 25 LR_NE Versus LR_GDA_LLR_LEC
## 26 LR_NE Versus LR_GDA_LLR_LEC
## 27 LR_NE Versus LR_GDA_LLR_LEC
## 28 LR_NE Versus LR_GDA_LLR_LEC
## 29 LR_NE Versus LR_GDA_LLR_LEC
## 30 LR_NE Versus LR_GDA_LLR_LEC
## 31 LR_NE Versus LR_GDA_LLR_HEC
## 32 LR_NE Versus LR_GDA_LLR_HEC
## 33 LR_NE Versus LR_GDA_LLR_HEC
## 34 LR_NE Versus LR_GDA_LLR_HEC
## 35 LR_NE Versus LR_GDA_LLR_HEC
## 36 LR_NE Versus LR_GDA_LLR_HEC
## 37 LR_NE Versus LR_GDA_VHLR_LEC
## 38 LR_NE Versus LR_GDA_VHLR_LEC
## 39 LR_NE Versus LR_GDA_VHLR_LEC
## 40 LR_NE Versus LR_GDA_VHLR_LEC
## 41 LR_NE Versus LR_GDA_VHLR_LEC
## 42 LR_NE Versus LR_GDA_VHLR_LEC
## 43 LR_NE Versus LR_GDA_VHLR_HEC
## 44 LR_NE Versus LR_GDA_VHLR_HEC
## 45 LR_NE Versus LR_GDA_VHLR_HEC
## 46 LR_NE Versus LR_GDA_VHLR_HEC
## 47 LR_NE Versus LR_GDA_VHLR_HEC
## 48 LR_NE Versus LR_GDA_VHLR_HEC
## 49 LR_NE Versus LR_GDA_HLR_LEC
## 50 LR_NE Versus LR_GDA_HLR_LEC
## 51 LR_NE Versus LR_GDA_HLR_LEC
## 52 LR_NE Versus LR_GDA_HLR_LEC
## 53 LR_NE Versus LR_GDA_HLR_LEC
## 54 LR_NE Versus LR_GDA_HLR_LEC
## 55 LR_NE Versus LR_GDA_HLR_HEC
## 56 LR_NE Versus LR_GDA_HLR_HEC
## 57 LR_NE Versus LR_GDA_HLR_HEC
## 58 LR_NE Versus LR_GDA_HLR_HEC
## 59 LR_NE Versus LR_GDA_HLR_HEC
## 60 LR_NE Versus LR_GDA_HLR_HEC
## 61 LR_NE Versus LR_GDA_LLR_LEC
## 62 LR_NE Versus LR_GDA_LLR_LEC
## 63 LR_NE Versus LR_GDA_LLR_LEC
## 64 LR_NE Versus LR_GDA_LLR_LEC
## 65 LR_NE Versus LR_GDA_LLR_LEC
## 66 LR_NE Versus LR_GDA_LLR_LEC
## 67 LR_NE Versus LR_GDA_LLR_HEC
## 68 LR_NE Versus LR_GDA_LLR_HEC
## 69 LR_NE Versus LR_GDA_LLR_HEC
## 70 LR_NE Versus LR_GDA_LLR_HEC
## 71 LR_NE Versus LR_GDA_LLR_HEC
## 72 LR_NE Versus LR_GDA_LLR_HEC
# Drawing the coefficient comparison: coefficient names on the vertical axis,
# estimated values on the horizontal axis, conditioned on Group and with the
# estimation methods overlaid as groups within each panel
dotplot(
  LRCoefficientNames ~ LRCoefficients | Group,
  data = LR_NE_GDA_ConsolidatedSummary,
  groups = EstimationMethod,
  # Marker appearance
  type = c("p", "h"),
  # origin = 0,
  pch = 16,
  cex = 2,
  alpha = 0.45,
  # Legend for the estimation methods
  auto.key = list(adj = 1),
  # Labels
  xlab = "Estimated Linear Regression Coefficient Values",
  ylab = "Linear Regression Coefficients",
  main = "Estimated Linear Regression Coefficient Value Comparison"
)