##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(moments)
library(skimr)
library(dplyr)
library(RANN)
library(corrplot)
library(lares)
library(DMwR2)
##################################
# Loading dataset
##################################
# Load the `schedulingData` data set into the workspace.
# NOTE(review): presumably shipped with AppliedPredictiveModeling, which is
# attached above -- confirm.
data(schedulingData)
##################################
# Performing a general exploration of the dataset
##################################
# Number of rows and columns (auto-printed at top level).
dim(schedulingData)
## [1] 4331 8
# Storage type and leading values of every column.
str(schedulingData)
## 'data.frame': 4331 obs. of 8 variables:
## $ Protocol : Factor w/ 14 levels "A","C","D","E",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Compounds : num 997 97 101 93 100 100 105 98 101 95 ...
## $ InputFields: num 137 103 75 76 82 82 88 95 91 92 ...
## $ Iterations : num 20 20 10 20 20 20 20 20 20 20 ...
## $ NumPending : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hour : num 14 13.8 13.8 10.1 10.4 ...
## $ Day : Factor w/ 7 levels "Mon","Tue","Wed",..: 2 2 4 5 5 3 5 5 5 3 ...
## $ Class : Factor w/ 4 levels "VF","F","M","L": 2 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: level counts for factors, quartiles/mean for numerics
# (see the recorded output below).
summary(schedulingData)
## Protocol Compounds InputFields Iterations
## J : 989 Min. : 20.0 Min. : 10 Min. : 10.00
## O : 581 1st Qu.: 98.0 1st Qu.: 134 1st Qu.: 20.00
## N : 536 Median : 226.0 Median : 426 Median : 20.00
## M : 451 Mean : 497.7 Mean : 1537 Mean : 29.24
## I : 381 3rd Qu.: 448.0 3rd Qu.: 991 3rd Qu.: 20.00
## H : 321 Max. :14103.0 Max. :56671 Max. :200.00
## (Other):1072
## NumPending Hour Day Class
## Min. : 0.00 Min. : 0.01667 Mon:692 VF:2211
## 1st Qu.: 0.00 1st Qu.:10.90000 Tue:900 F :1347
## Median : 0.00 Median :14.01667 Wed:903 M : 514
## Mean : 53.39 Mean :13.73376 Thu:720 L : 259
## 3rd Qu.: 0.00 3rd Qu.:16.60000 Fri:923
## Max. :5605.00 Max. :23.98333 Sat: 32
## Sun:161
##################################
# Formulating a data type assessment summary
##################################
# Work on a copy of the scheduling data.
PDA <- schedulingData

# One row per column: position, name and primary class.
# - seq_along() stays correct for a zero-column frame, where 1:length() would
#   count down as c(1, 0).
# - class(x)[1L] yields exactly one label per column, so a multi-class column
#   (e.g. an ordered factor) cannot turn Column.Type into a list or matrix.
PDA.Summary <- data.frame(
  Column.Index = seq_along(names(PDA)),
  Column.Name = names(PDA),
  Column.Type = vapply(PDA, function(x) class(x)[1L], character(1)),
  row.names = NULL
)
# Bare name so the summary is auto-printed at top level.
PDA.Summary
## Column.Index Column.Name Column.Type
## 1 1 Protocol factor
## 2 2 Compounds numeric
## 3 3 InputFields numeric
## 4 4 Iterations numeric
## 5 5 NumPending numeric
## 6 6 Hour numeric
## 7 7 Day factor
## 8 8 Class factor
##################################
# Loading dataset
##################################
# Copy of the scheduling data used by the data quality assessment.
DQA <- schedulingData
##################################
# Listing all predictors
##################################
# Every column except the response `Class`. `drop = FALSE` keeps the result a
# data frame even when only a single predictor column remains.
DQA.Predictors <- DQA[, !names(DQA) %in% c("Class"), drop = FALSE]
##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column of DQA: position, name, primary class, total row count,
# number of NA entries, and the share of non-NA entries formatted as text with
# three decimals. vapply() pins each column's type so the frame's layout does
# not depend on the input.
DQA.Summary <- data.frame(
  Column.Index = seq_along(names(DQA)),
  Column.Name = names(DQA),
  Column.Type = vapply(DQA, function(x) class(x)[1L], character(1)),
  Row.Count = rep(nrow(DQA), length(names(DQA))),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL
)
# Bare name so the summary is auto-printed at top level.
DQA.Summary
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 Protocol factor 4331 0 1.000
## 2 2 Compounds numeric 4331 0 1.000
## 3 3 InputFields numeric 4331 0 1.000
## 4 4 Iterations numeric 4331 0 1.000
## 5 5 NumPending numeric 4331 0 1.000
## 6 6 Hour numeric 4331 0 1.000
## 7 7 Day factor 4331 0 1.000
## 8 8 Class factor 4331 0 1.000
##################################
# Listing all numeric predictors
##################################
# Keep the numeric predictor columns. `drop = FALSE` matters: without it a
# single numeric column collapses to a bare vector, names() returns NULL and
# the check below would wrongly report that there are no numeric predictors.
DQA.Predictors.Numeric <- DQA.Predictors[
  , vapply(DQA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]

if (length(names(DQA.Predictors.Numeric)) > 0) {
  print(paste0(
    "There are ",
    length(names(DQA.Predictors.Numeric)),
    " numeric predictor variable(s)."
  ))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 5 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# Keep the factor predictor columns. `drop = FALSE` matters: without it a
# single factor column collapses to a bare vector, names() returns NULL and
# the check below would wrongly report that there are no factor predictors.
DQA.Predictors.Factor <- DQA.Predictors[
  , vapply(DQA.Predictors, is.factor, logical(1)),
  drop = FALSE
]

if (length(names(DQA.Predictors.Factor)) > 0) {
  print(paste0(
    "There are ",
    length(names(DQA.Predictors.Factor)),
    " factor predictor variable(s)."
  ))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are 2 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every non-NA value of `x` that attains the highest frequency
  # (ties are all returned, in order of first appearance).
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent value(s) of `x` once every first-mode value has
  # been removed. NA entries are dropped before counting, matching the numeric
  # variant of this helper; previously NA could be returned as a "mode".
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    xs <- na.omit(x)
    sm <- xs[!(xs %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }

  # One row per factor predictor. Mode values use the first element of the
  # helpers' result, i.e. ties are broken by order of first appearance.
  # Ratios are stored as text formatted to three decimals.
  DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = vapply(
      DQA.Predictors.Factor,
      function(x) class(x)[1L],
      character(1)
    ),
    Unique.Count = vapply(
      DQA.Predictors.Factor,
      function(x) length(unique(x)),
      integer(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(FirstModes(x)[1]),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(SecondModes(x)[1]),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(
          round(length(unique(x)) / nrow(DQA.Predictors.Factor), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(
          round(
            sum(na.omit(x) == FirstModes(x)[1]) /
              sum(na.omit(x) == SecondModes(x)[1]),
            3
          ),
          nsmall = 3
        )
      },
      character(1)
    ),
    row.names = NULL
  )
  # Last expression of the branch, so the summary is auto-printed.
  DQA.Predictors.Factor.Summary
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 Protocol factor 14 J O
## 2 Day factor 7 Fri Wed
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio First.Second.Mode.Ratio
## 1 989 581 0.003 1.702
## 2 923 903 0.002 1.022
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every non-NA value of `x` that attains the highest frequency
  # (ties are all returned, in order of first appearance).
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }

  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent value(s) of `x` once NA entries and every
  # first-mode value have been removed.
  SecondModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    xs <- na.omit(x)
    sm <- xs[!(xs %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }

  # One row per numeric predictor. Mode values use the first element of the
  # helpers' result. Every statistic except the counts is stored as text
  # formatted to three decimals; the checks further down convert it back with
  # as.numeric(as.character(...)).
  DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = vapply(
      DQA.Predictors.Numeric,
      function(x) class(x)[1L],
      character(1)
    ),
    Unique.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) length(unique(x)),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(
          round(length(unique(x)) / nrow(DQA.Predictors.Numeric), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(FirstModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(SecondModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(
          round(
            sum(na.omit(x) == FirstModes(x)[1]) /
              sum(na.omit(x) == SecondModes(x)[1]),
            3
          ),
          nsmall = 3
        )
      },
      character(1)
    ),
    Minimum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(min(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Mean = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Median = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(median(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Maximum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(max(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Skewness = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(skewness(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Kurtosis = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(kurtosis(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    # unname() strips the "25%"/"75%" label quantile() attaches.
    Percentile25th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(
          round(unname(quantile(x, probs = 0.25, na.rm = TRUE)), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    Percentile75th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(
          round(unname(quantile(x, probs = 0.75, na.rm = TRUE)), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    row.names = NULL
  )
  # Last expression of the branch, so the summary is auto-printed.
  DQA.Predictors.Numeric.Summary
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 Compounds numeric 858 0.198 20.000
## 2 InputFields numeric 1730 0.399 10.000
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## 5 Hour numeric 924 0.213 13.083
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 31.000 96 29 3.310
## 2 466.000 82 27 3.037
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## 5 21.067 28 25 1.120
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 20.000 497.742 226.000 14103.000 6.568 69.486 98.000
## 2 10.000 1537.055 426.000 56671.000 5.870 54.919 134.000
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## 5 0.017 13.734 14.017 23.983 -0.546 3.747 10.900
## Percentile75th
## 1 448.000
## 2 991.000
## 3 20.000
## 4 0.000
## 5 16.600
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Rows of the overall summary whose column has at least one NA. Computed once
# instead of re-filtering for the test, the message and the listing.
DQA.Summary.Missing <- DQA.Summary[which(DQA.Summary$NA.Count > 0), ]

if (nrow(DQA.Summary.Missing) > 0) {
  print(paste0(
    "Missing observations noted for ",
    nrow(DQA.Summary.Missing),
    " variable(s) with NA.Count>0 and Fill.Rate<1.0."
  ))
  # Last expression of the branch, so the affected rows are auto-printed.
  DQA.Summary.Missing
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else {
  # The ratio is stored as formatted text, so convert it back to a number.
  # which() drops NA ratios instead of producing all-NA rows in the subset.
  DQA.Predictors.Factor.LowVariance <- DQA.Predictors.Factor.Summary[
    which(
      as.numeric(
        as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio)
      ) > 5
    ),
  ]
  if (nrow(DQA.Predictors.Factor.LowVariance) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Predictors.Factor.LowVariance),
      " factor variable(s) with First.Second.Mode.Ratio>5."
    ))
    # Last expression of the branch, so the flagged rows are auto-printed.
    DQA.Predictors.Factor.LowVariance
  } else {
    print("No low variance factor predictors due to high first-second mode ratio noted.")
  }
}
## [1] "No low variance factor predictors due to high first-second mode ratio noted."
# Flag numeric predictors whose first mode occurs more than five times as
# often as the second mode.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  # The ratio is stored as formatted text, so convert it back to a number.
  # which() drops NA ratios instead of producing all-NA rows in the subset.
  DQA.Predictors.Numeric.HighModeRatio <- DQA.Predictors.Numeric.Summary[
    which(
      as.numeric(
        as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio)
      ) > 5
    ),
  ]
  if (nrow(DQA.Predictors.Numeric.HighModeRatio) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Predictors.Numeric.HighModeRatio),
      " numeric variable(s) with First.Second.Mode.Ratio>5."
    ))
    # Last expression of the branch, so the flagged rows are auto-printed.
    DQA.Predictors.Numeric.HighModeRatio
  } else {
    print("No low variance numeric predictors due to high first-second mode ratio noted.")
  }
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## Percentile75th
## 3 20.000
## 4 0.000
# Flag numeric predictors whose share of distinct values is below 1%.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  # The ratio is stored as formatted text, so convert it back to a number.
  # which() drops NA ratios instead of producing all-NA rows in the subset.
  DQA.Predictors.Numeric.LowUniqueRatio <- DQA.Predictors.Numeric.Summary[
    which(
      as.numeric(
        as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio)
      ) < 0.01
    ),
  ]
  if (nrow(DQA.Predictors.Numeric.LowUniqueRatio) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Predictors.Numeric.LowUniqueRatio),
      " numeric variable(s) with Unique.Count.Ratio<0.01."
    ))
    # Last expression of the branch, so the flagged rows are auto-printed.
    DQA.Predictors.Numeric.LowUniqueRatio
  } else {
    print("No low variance numeric predictors due to low unique count ratio noted.")
  }
}
## [1] "Low variance observed for 1 numeric variable(s) with Unique.Count.Ratio<0.01."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 Iterations numeric 11 0.003 20.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 10.000 3568 272 13.118
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000 20.000
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  # Skewness is stored as formatted text, so convert it back to a number.
  # abs(s) > 3 is the same test as s > 3 | s < -3; which() drops NA values
  # instead of producing all-NA rows in the subset.
  DQA.Predictors.Numeric.Skewed <- DQA.Predictors.Numeric.Summary[
    which(
      abs(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))) > 3
    ),
  ]
  if (nrow(DQA.Predictors.Numeric.Skewed) > 0) {
    print(paste0(
      "High skewness observed for ",
      nrow(DQA.Predictors.Numeric.Skewed),
      " numeric variable(s) with Skewness>3 or Skewness<(-3)."
    ))
    # Last expression of the branch, so the flagged rows are auto-printed.
    DQA.Predictors.Numeric.Skewed
  } else {
    print("No skewed numeric predictors noted.")
  }
}
## [1] "High skewness observed for 4 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 Compounds numeric 858 0.198 20.000
## 2 InputFields numeric 1730 0.399 10.000
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 31.000 96 29 3.310
## 2 466.000 82 27 3.037
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 20.000 497.742 226.000 14103.000 6.568 69.486 98.000
## 2 10.000 1537.055 426.000 56671.000 5.870 54.919 134.000
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## Percentile75th
## 1 448.000
## 2 991.000
## 3 20.000
## 4 0.000
##################################
# Loading dataset
##################################
# Copy of the scheduling data used by the data preprocessing steps.
DPA <- schedulingData
##################################
# Gathering descriptive statistics
##################################
# Keep the skim summary and auto-print it (bare name on the next line).
DPA_Skimmed <- skim(DPA)
DPA_Skimmed
## Name | DPA |
## Number of rows | 4331 |
## Number of columns | 8 |
## _______________________ | |
## Column type frequency: | |
## factor | 3 |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Identifying columns with missing data
#################################
# Skim the data and keep only the per-variable rows that report at least one
# missing value; the resulting tibble is auto-printed.
DPA %>%
  skim() %>%
  dplyr::filter(n_missing > 0)
## # A tibble: 0 x 15
## # i 15 variables: skim_type <chr>, skim_variable <chr>, n_missing <int>,
## # complete_rate <dbl>, factor.ordered <lgl>, factor.n_unique <int>,
## # factor.top_counts <chr>, numeric.mean <dbl>, numeric.sd <dbl>,
## # numeric.p0 <dbl>, numeric.p25 <dbl>, numeric.p50 <dbl>, numeric.p75 <dbl>,
## # numeric.p100 <dbl>, numeric.hist <chr>
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
##################################
# Every column except the response `Class`.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# `drop = FALSE` keeps a data frame even if only one numeric column matches,
# so the ncol()/names() calls in the outlier section keep working.
DPA.Predictors.Numeric <- DPA.Predictors[
  , vapply(DPA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Identifying outliers for the numeric predictors
##################################
# One outlier count per numeric predictor, preallocated rather than grown
# inside the loop.
OutlierCountList <- integer(ncol(DPA.Predictors.Numeric))

# seq_len() makes the loop a no-op for zero columns (1:0 would iterate twice).
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  # Values boxplot.stats() reports in `$out` for this column.
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[[i]])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  # Row positions holding one of those outlier values.
  OutlierIndices <- which(DPA.Predictors.Numeric[[i]] %in% Outliers)
  boxplot(DPA.Predictors.Numeric[[i]],
    ylab = names(DPA.Predictors.Numeric)[i],
    main = names(DPA.Predictors.Numeric)[i],
    horizontal = TRUE
  )
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}

# Build the summary directly instead of via cbind(), which coerced the counts
# to text and required converting them back. as.numeric() keeps the column a
# double, as before.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- nrow(
  OutlierCountSummary[OutlierCountSummary$OutlierCount > 0, ]
)
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)."))
## [1] "5 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# Keep the skim summary of the numeric predictors and auto-print it.
DPA_Skimmed <- skim(DPA.Predictors.Numeric)
DPA_Skimmed
## Name | DPA.Predictors.Numeric |
## Number of rows | 4331 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a center, scale and spatial sign data transformation
##################################
# Estimate the preprocessing (methods "center", "scale", "spatialSign") on the
# numeric predictors, then apply it to those same predictors.
DPA_CenteredScaledSpatialSigned <- preProcess(
  DPA.Predictors.Numeric,
  method = c("center", "scale", "spatialSign")
)
DPA_CenteredScaledSpatialSignedTransformed <- predict(
  DPA_CenteredScaledSpatialSigned,
  DPA.Predictors.Numeric
)
##################################
# Gathering descriptive statistics
##################################
# Keep the skim summary of the transformed predictors and auto-print it.
DPA_CenteredScaledSpatialSignedTransformedSkimmed <- skim(
  DPA_CenteredScaledSpatialSignedTransformed
)
DPA_CenteredScaledSpatialSignedTransformedSkimmed
## Name | DPA_CenteredScaledSpatial… |
## Number of rows | 4331 |
## Number of columns | 5 |
## _______________________ | |
## Column type frequency: | |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Compounds | 0 | 1 | -0.14 | 0.37 | -0.82 | -0.39 | -0.21 | -0.04 | 1.00 | ▂▇▃▁▁ |
## InputFields | 0 | 1 | -0.16 | 0.41 | -0.81 | -0.41 | -0.25 | -0.07 | 1.00 | ▃▇▂▁▂ |
## Iterations | 0 | 1 | -0.18 | 0.37 | -0.92 | -0.38 | -0.25 | -0.14 | 1.00 | ▁▇▂▁▁ |
## NumPending | 0 | 1 | -0.09 | 0.21 | -0.46 | -0.18 | -0.13 | -0.06 | 1.00 | ▃▇▁▁▁ |
## Hour | 0 | 1 | 0.04 | 0.65 | -0.99 | -0.63 | 0.08 | 0.70 | 0.99 | ▇▃▅▃▇ |
##################################
# Identifying outliers for the numeric predictors
##################################
# One outlier count per transformed predictor, preallocated rather than grown
# inside the loop.
OutlierCountList <- integer(ncol(DPA_CenteredScaledSpatialSignedTransformed))

# seq_len() makes the loop a no-op for zero columns (1:0 would iterate twice).
for (i in seq_len(ncol(DPA_CenteredScaledSpatialSignedTransformed))) {
  # Values boxplot.stats() reports in `$out` for this transformed column.
  Outliers <- boxplot.stats(DPA_CenteredScaledSpatialSignedTransformed[[i]])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  # Look the outlier values up in the transformed column they came from. The
  # previous code searched the untransformed predictors, where transformed
  # values do not occur, so the indices were wrong.
  OutlierIndices <- which(
    DPA_CenteredScaledSpatialSignedTransformed[[i]] %in% Outliers
  )
  boxplot(DPA_CenteredScaledSpatialSignedTransformed[[i]],
    ylab = names(DPA.Predictors.Numeric)[i],
    main = names(DPA.Predictors.Numeric)[i],
    horizontal = TRUE
  )
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}

# Build the summary directly instead of via cbind(), which coerced the counts
# to text and required converting them back. as.numeric() keeps the column a
# double, as before.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- nrow(
  OutlierCountSummary[OutlierCountSummary$OutlierCount > 0, ]
)
print(paste0(NumericPredictorWithOutlierCount, " numeric variable(s) were noted with outlier(s)."))
## [1] "4 numeric variable(s) were noted with outlier(s)."
##################################
# Loading dataset
##################################
# Fresh copy of the scheduling data for the low-variance analysis.
DPA <- schedulingData
##################################
# Gathering descriptive statistics
##################################
# Keep the skim summary and auto-print it (bare name on the next line).
DPA_Skimmed <- skim(DPA)
DPA_Skimmed
## Name | DPA |
## Number of rows | 4331 |
## Number of columns | 8 |
## _______________________ | |
## Column type frequency: | |
## factor | 3 |
## numeric | 5 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Identifying columns with low variance
###################################
# Per-column near-zero-variance metrics for every column of DPA, using
# freqCut = 80/20 and uniqueCut = 10.
DPA_LowVariance <- nearZeroVar(DPA,
  freqCut = 80 / 20,
  uniqueCut = 10,
  saveMetrics = TRUE
)
# Auto-print the metric rows flagged via the `nzv` column.
DPA_LowVariance[DPA_LowVariance$nzv, ]
## freqRatio percentUnique zeroVar nzv
## Iterations 13.11765 0.2539829 FALSE TRUE
## NumPending 19.84848 6.9960748 FALSE TRUE
if (nrow(DPA_LowVariance[DPA_LowVariance$nzv, ]) == 0) {
  print("No low variance predictors noted.")
} else {
  print(paste0(
    "Low variance observed for ",
    nrow(DPA_LowVariance[DPA_LowVariance$nzv, ]),
    " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."
  ))

  # Number of flagged columns.
  DPA_LowVarianceForRemoval <- nrow(DPA_LowVariance[DPA_LowVariance$nzv, ])

  print(paste0(
    "Low variance can be resolved by removing ",
    nrow(DPA_LowVariance[DPA_LowVariance$nzv, ]),
    " numeric variable(s)."
  ))

  # Announce each flagged column by name.
  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- rownames(
      DPA_LowVariance[DPA_LowVariance$nzv, ]
    )[j]
    print(paste0(
      "Variable ",
      j, " for removal: ",
      DPA_LowVarianceRemovedVariable
    ))
  }

  # Show the skim rows of the flagged columns. This pipeline sits inside braces
  # and is not the last expression, so its value was silently discarded;
  # print() makes it visible.
  print(
    DPA %>%
      skim() %>%
      dplyr::filter(
        skim_variable %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ])
      )
  )

  ##################################
  # Filtering out columns with low variance
  #################################
  DPA_ExcludedLowVariance <- DPA[
    , !names(DPA) %in% rownames(DPA_LowVariance[DPA_LowVariance$nzv, ]),
    drop = FALSE
  ]

  ##################################
  # Gathering descriptive statistics
  ##################################
  # Last expression of the branch, so the skim summary is auto-printed.
  DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance)
  DPA_ExcludedLowVariance_Skimmed
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."
## [1] "Low variance can be resolved by removing 2 numeric variable(s)."
## [1] "Variable 1 for removal: Iterations"
## [1] "Variable 2 for removal: NumPending"
## Name | DPA_ExcludedLowVariance |
## Number of rows | 4331 |
## Number of columns | 6 |
## _______________________ | |
## Column type frequency: | |
## factor | 3 |
## numeric | 3 |
## ________________________ | |
## Group variables | None |
## Variable type: factor
## skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## ---|---|---|---|---|---|
## Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## ---|---|---|---|---|---|---|---|---|---|---|
## Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Loading dataset
##################################
# Fresh copy of the scheduling data for the correlation analysis.
DPA <- schedulingData
##################################
# Listing all predictors
##################################
# Every column except the response `Class`.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# `drop = FALSE` keeps a data frame even if only one numeric column matches.
DPA.Predictors.Numeric <- DPA.Predictors[
  , vapply(DPA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Visualizing pairwise correlation between predictors
##################################
# Significance test results for the pairwise Pearson correlations; the `$p`
# component is passed to the plot below as `p.mat`.
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
  method = "pearson",
  conf.level = .95
)

# Upper-triangle circle plot of the Pearson correlations. With
# `insig = "blank"` and `sig.level = 0.05`, cells are governed by `p.mat`.
corrplot(
  cor(DPA.Predictors.Numeric,
    method = "pearson",
    use = "pairwise.complete.obs"
  ),
  method = "circle",
  type = "upper",
  order = "original",
  tl.col = "black",
  tl.cex = 0.75,
  tl.srt = 90,
  sig.level = 0.05,
  p.mat = DPA_CorrelationTest$p,
  insig = "blank"
)
##################################
# Identifying the highly correlated variables
##################################
# Pearson correlation matrix of the numeric predictors.
DPA_Correlation <- cor(DPA.Predictors.Numeric,
                       method = "pearson",
                       use = "pairwise.complete.obs")

# Number of distinct predictor pairs (upper triangle only, so each pair is
# counted once) whose absolute correlation exceeds 0.95. The outer
# parentheses make the assigned value print.
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))
## [1] 0
# Report whether any highly correlated predictor pairs were found and, if so,
# rank the most strongly correlated pairs.
if (DPA_HighlyCorrelatedCount == 0) {
  print("No highly correlated predictors noted.")
} else {
  print(paste0("High correlation observed for ",
               (DPA_HighlyCorrelatedCount),
               " pairs of numeric variable(s) with Correlation.Coefficient>0.95."))

  # Last expression of the branch, so its value is printed at top level.
  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           max_pvalue = 0.05,
                                           top = DPA_HighlyCorrelatedCount,
                                           rm.na = TRUE,
                                           grid = FALSE
  ))
}
## [1] "No highly correlated predictors noted."
# When highly correlated pairs exist, list the variables suggested for
# removal and build a copy of the dataset without them.
if (DPA_HighlyCorrelatedCount > 0) {
  # Column positions are relative to DPA_Correlation, i.e. to the columns of
  # DPA.Predictors.Numeric -- not to the columns of DPA.
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)

  (DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated))

  print(paste0("High correlation can be resolved by removing ",
               (DPA_HighlyCorrelatedForRemoval),
               " numeric variable(s)."))

  # seq_len() is safe even if nothing was flagged for removal.
  for (j in seq_len(DPA_HighlyCorrelatedForRemoval)) {
    DPA_HighlyCorrelatedRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }

  ##################################
  # Filtering out columns with high correlation
  ##################################
  # Remove by name: the positions returned above index the numeric predictors,
  # so using them directly on DPA (which also holds factor columns) would
  # drop the wrong columns.
  DPA_ExcludedHighCorrelation <- DPA[, !names(DPA) %in% colnames(DPA.Predictors.Numeric)[DPA_HighlyCorrelated], drop = FALSE]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
}
##################################
# Loading dataset
##################################
DPA <- schedulingData

##################################
# Listing all predictors
##################################
# Every column except the response variable "Class" is kept as a predictor.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# vapply() always yields a logical mask; drop = FALSE keeps a data frame
# even if only one numeric column is present.
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]

##################################
# Finding linear dependencies
##################################
# Computed once; the result carries both $linearCombos and $remove.
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

##################################
# Identifying the linearly dependent variables
##################################
# Number of linearly dependent column subsets; the outer parentheses make
# the assigned value print.
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
# Report whether any linearly dependent subsets were found and, if so, print
# the column names that make up each subset.
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               (DPA_LinearlyDependentCount),
               " subset(s) of numeric variable(s)."))

  for (i in seq_len(DPA_LinearlyDependentCount)) {
    # Translate the column positions of subset i into column names.
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    # paste0() is vectorised, so one message is printed per column name.
    print(paste0("Linear dependent variable(s) for subset ",
                 i,
                 " include: ",
                 DPA_LinearlyDependentSubset))
  }
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
if (DPA_LinearlyDependentCount > 0) {
  # Recomputed here so that this chunk does not rely on an earlier result.
  DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)

  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)

  print(paste0("Linear dependency can be resolved by removing ",
               (DPA_LinearlyDependentForRemoval),
               " numeric variable(s)."))

  # seq_len() is safe even if nothing was flagged for removal.
  for (j in seq_len(DPA_LinearlyDependentForRemoval)) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }

  ##################################
  # Filtering out columns with linear dependency
  ##################################
  # $remove holds column positions within DPA.Predictors.Numeric, so negative
  # indexing on that same data frame is consistent; drop = FALSE keeps a data
  # frame even if a single column remains.
  DPA_ExcludedLinearlyDependent <- DPA.Predictors.Numeric[, -DPA_LinearlyDependent$remove, drop = FALSE]

  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
}
##################################
# Loading dataset
##################################
DPA <- schedulingData

##################################
# Listing all predictors
##################################
# Every column except the response variable "Class" is kept as a predictor.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# vapply() always yields a logical mask; drop = FALSE keeps a data frame
# even if only one numeric column is present.
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]

##################################
# Gathering descriptive statistics
##################################
# Baseline summary of the untransformed numeric predictors; the outer
# parentheses make the assigned value print.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Name | DPA.Predictors.Numeric |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a center transformation
##################################
# preProcess() estimates the transformation; predict() applies it to the data.
DPA_Centered <- preProcess(DPA.Predictors.Numeric, method = c("center"))
DPA_CenteredTransformed <- predict(DPA_Centered, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_CenteredTransformedSkimmed <- skim(DPA_CenteredTransformed))
Name | DPA_CenteredTransformed |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 0 | 1020.17 | -477.74 | -399.74 | -271.74 | -49.74 | 13605.26 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 0 | 3650.08 | -1527.06 | -1403.06 | -1111.06 | -546.06 | 55133.94 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 0 | 34.42 | -19.24 | -9.24 | -9.24 | -9.24 | 170.76 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 0 | 355.96 | -53.39 | -53.39 | -53.39 | -53.39 | 5551.61 | ▇▁▁▁▁ |
Hour | 0 | 1 | 0 | 3.98 | -13.72 | -2.83 | 0.28 | 2.87 | 10.25 | ▁▂▇▇▁ |
##################################
# Applying a center and scale data transformation
##################################
# preProcess() estimates the transformation; predict() applies it to the data.
DPA_CenteredScaled <- preProcess(DPA.Predictors.Numeric, method = c("center", "scale"))
DPA_CenteredScaledTransformed <- predict(DPA_CenteredScaled, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_CenteredScaledTransformedSkimmed <- skim(DPA_CenteredScaledTransformed))
Name | DPA_CenteredScaledTransfo… |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 0 | 1 | -0.47 | -0.39 | -0.27 | -0.05 | 13.34 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 0 | 1 | -0.42 | -0.38 | -0.30 | -0.15 | 15.10 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 0 | 1 | -0.56 | -0.27 | -0.27 | -0.27 | 4.96 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 0 | 1 | -0.15 | -0.15 | -0.15 | -0.15 | 15.60 | ▇▁▁▁▁ |
Hour | 0 | 1 | 0 | 1 | -3.45 | -0.71 | 0.07 | 0.72 | 2.57 | ▁▂▇▇▁ |
##################################
# Applying a range transformation
##################################
# Rescales each numeric predictor to the interval given by rangeBounds.
# preProcess() estimates the transformation; predict() applies it to the data.
DPA_Ranged <- preProcess(DPA.Predictors.Numeric, method = c("range"), rangeBounds = c(0, 1))
DPA_RangedTransformed <- predict(DPA_Ranged, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_RangedTransformedSkimmed <- skim(DPA_RangedTransformed))
Name | DPA_RangedTransformed |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 0.03 | 0.07 | 0 | 0.01 | 0.01 | 0.03 | 1 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 0.03 | 0.06 | 0 | 0.00 | 0.01 | 0.02 | 1 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 0.10 | 0.18 | 0 | 0.05 | 0.05 | 0.05 | 1 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 0.01 | 0.06 | 0 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
Hour | 0 | 1 | 0.57 | 0.17 | 0 | 0.45 | 0.58 | 0.69 | 1 | ▁▂▇▇▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData

##################################
# Listing all predictors
##################################
# Every column except the response variable "Class" is kept as a predictor.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# vapply() always yields a logical mask; drop = FALSE keeps a data frame
# even if only one numeric column is present.
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]

##################################
# Gathering descriptive statistics
##################################
# Baseline summary of the untransformed numeric predictors, for comparison
# with the shape transformations that follow.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
Name | DPA.Predictors.Numeric |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a Box-Cox transformation
##################################
# preProcess() estimates the transformation; predict() applies it to the data.
# NOTE(review): presumably columns containing non-positive values (e.g. a
# minimum of 0) are left untransformed by Box-Cox -- confirm against the
# caret documentation.
DPA_BoxCox <- preProcess(DPA.Predictors.Numeric, method = c("BoxCox"))
DPA_BoxCoxTransformed <- predict(DPA_BoxCox, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_BoxCoxTransformedSkimmed <- skim(DPA_BoxCoxTransformed))
Name | DPA_BoxCoxTransformed |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 5.36 | 1.25 | 3.00 | 4.58 | 5.42 | 6.10 | 9.55 | ▅▇▇▂▁ |
InputFields | 0 | 1 | 5.98 | 1.65 | 2.30 | 4.90 | 6.05 | 6.90 | 10.95 | ▂▆▇▃▁ |
Iterations | 0 | 1 | 0.95 | 0.02 | 0.90 | 0.95 | 0.95 | 0.95 | 1.00 | ▁▁▇▁▁ |
NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.00 | 0.00 | 0.00 | 5605.00 | ▇▁▁▁▁ |
Hour | 0 | 1 | 22.83 | 8.41 | -0.77 | 16.40 | 23.04 | 28.89 | 47.09 | ▁▆▇▆▁ |
##################################
# Applying a Yeo-Johnson transformation
##################################
# preProcess() estimates the transformation; predict() applies it to the data.
DPA_YeoJohnson <- preProcess(DPA.Predictors.Numeric, method = c("YeoJohnson"))
DPA_YeoJohnsonTransformed <- predict(DPA_YeoJohnson, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_YeoJohnsonTransformedSkimmed <- skim(DPA_YeoJohnsonTransformed))
Name | DPA_YeoJohnsonTransformed |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 4.32 | 0.80 | 2.70 | 3.84 | 4.40 | 4.83 | 6.67 | ▃▅▇▂▁ |
InputFields | 0 | 1 | 5.40 | 1.35 | 2.31 | 4.53 | 5.49 | 6.17 | 9.19 | ▂▅▇▃▁ |
Iterations | 0 | 1 | 0.92 | 0.01 | 0.88 | 0.92 | 0.92 | 0.92 | 0.95 | ▁▁▇▁▁ |
NumPending | 0 | 1 | 0.19 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 0.91 | ▇▁▁▁▂ |
Hour | 0 | 1 | 33.41 | 12.40 | 0.02 | 23.80 | 33.53 | 42.30 | 70.46 | ▁▆▇▅▁ |
##################################
# Applying an exponential transformation
##################################
# preProcess() estimates the transformation; predict() applies it to the data.
DPA_ExpoTrans <- preProcess(DPA.Predictors.Numeric, method = c("expoTrans"))
DPA_ExpoTransTransformed <- predict(DPA_ExpoTrans, DPA.Predictors.Numeric)

##################################
# Gathering descriptive statistics
##################################
(DPA_ExpoTransTransformedSkimmed <- skim(DPA_ExpoTransTransformed))
Name | DPA_ExpoTransTransformed |
Number of rows | 4331 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.00 | 226.00 | 448.00 | 14103.00 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.00 | 426.00 | 991.00 | 56671.00 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 12.31 | 2.08 | 7.64 | 12.00 | 12.00 | 12.00 | 17.75 | ▁▁▇▁▁ |
NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.00 | 0.00 | 0.00 | 5605.00 | ▇▁▁▁▁ |
Hour | 0 | 1 | 19.04 | 6.86 | 0.02 | 13.76 | 18.98 | 23.83 | 40.95 | ▁▆▇▃▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData

##################################
# Listing all predictors
##################################
# Every column except the response variable "Class" is kept as a predictor.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]

##################################
# Listing all factor predictors
##################################
# drop = FALSE keeps a data frame even when exactly one factor column exists;
# otherwise the selection would collapse to a vector without names and the
# check below would wrongly report that there are no factor variables.
DPA.Predictors.Factor <- DPA.Predictors[, vapply(DPA.Predictors, is.factor, logical(1)), drop = FALSE]

##################################
# Applying dummy variable creation
##################################
if (length(names(DPA.Predictors.Factor)) > 0) {
  print(paste0("There are ",
               (length(names(DPA.Predictors.Factor))),
               " factor variables for dummy variable creation."))

  # Build the encoder from all predictors (Class is the formula response),
  # then expand the factor predictors into indicator columns.
  DPA_DummyVariables <- dummyVars(Class ~ ., data = DPA)
  DPA_DummyVariablesCreated <- predict(DPA_DummyVariables, DPA)

  ##################################
  # Gathering descriptive statistics
  ##################################
  # Last expression of the branch, so its value is printed at top level.
  (DPA_DummyVariablesCreatedSkimmed <- skim(DPA_DummyVariablesCreated))
} else {
  print("There are no factor variables for dummy variable creation.")
}
## [1] "There are 2 factor variables for dummy variable creation."
Name | DPA_DummyVariablesCreated |
Number of rows | 4331 |
Number of columns | 26 |
_______________________ | |
Column type frequency: | |
numeric | 26 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Protocol.A | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.C | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.D | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.E | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.F | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.G | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.H | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.I | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.J | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Protocol.K | 0 | 1 | 0.00 | 0.04 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.L | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.M | 0 | 1 | 0.10 | 0.31 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.N | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Protocol.O | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
Day.Mon | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Day.Tue | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Day.Wed | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Day.Thu | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Day.Fri | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
Day.Sat | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
Day.Sun | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData

##################################
# Listing all predictors
##################################
# Every column except the response variable "Class" is kept as a predictor.
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]

##################################
# Listing all numeric predictors
##################################
# vapply() always yields a logical mask; drop = FALSE keeps a data frame
# even if only one numeric column is present.
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
# Number of numeric predictors (also used as the panel count of the plots).
ncol(DPA.Predictors.Numeric)
## [1] 5
##################################
# Converting response variable data type to factor
##################################
DPA$Class <- as.factor(DPA$Class)
# Number of response classes.
length(levels(DPA$Class))
## [1] 4
##################################
# Formulating the box plots
##################################
# One panel per numeric predictor, stacked in a single column, with the
# response classes on the grouping axis and free scales in every panel.
featurePlot(
  x = DPA.Predictors.Numeric,
  y = DPA$Class,
  plot = "box",
  layout = c(1, ncol(DPA.Predictors.Numeric)),
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|"
)
##################################
# Formulating the strip plots
##################################
# One panel per numeric predictor, stacked in a single column; points are
# jittered and every panel uses free scales.
featurePlot(
  x = DPA.Predictors.Numeric,
  y = DPA$Class,
  plot = "strip",
  jitter = TRUE,
  layout = c(1, ncol(DPA.Predictors.Numeric)),
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|"
)
##################################
# Formulating the density plots
##################################
# One panel per numeric predictor, stacked in a single column; the legend
# gets one column per response class.
featurePlot(
  x = DPA.Predictors.Numeric,
  y = DPA$Class,
  plot = "density",
  layout = c(1, ncol(DPA.Predictors.Numeric)),
  scales = list(
    x = list(relation = "free", rot = 90),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  auto.key = list(columns = nlevels(DPA$Class))
)