##################################
# Loading R libraries
##################################
library(AppliedPredictiveModeling)
library(caret)
library(moments)
library(skimr)
library(dplyr)
library(RANN)
library(corrplot)
library(lares)
library(DMwR2)
##################################
# Loading dataset
##################################
# Makes the `schedulingData` data frame available in the workspace.
# NOTE(review): presumably supplied by the AppliedPredictiveModeling package
# attached above - confirm.
data(schedulingData)
##################################
# Performing a general exploration of the dataset
##################################
# Number of rows and columns.
dim(schedulingData)## [1] 4331 8
# Compact structure listing: one line per column with its type and first values.
str(schedulingData)## 'data.frame': 4331 obs. of 8 variables:
## $ Protocol : Factor w/ 14 levels "A","C","D","E",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Compounds : num 997 97 101 93 100 100 105 98 101 95 ...
## $ InputFields: num 137 103 75 76 82 82 88 95 91 92 ...
## $ Iterations : num 20 20 10 20 20 20 20 20 20 20 ...
## $ NumPending : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hour : num 14 13.8 13.8 10.1 10.4 ...
## $ Day : Factor w/ 7 levels "Mon","Tue","Wed",..: 2 2 4 5 5 3 5 5 5 3 ...
## $ Class : Factor w/ 4 levels "VF","F","M","L": 2 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles and mean for numeric columns, level counts for
# factor columns (see the pasted output below).
summary(schedulingData)## Protocol Compounds InputFields Iterations
## J : 989 Min. : 20.0 Min. : 10 Min. : 10.00
## O : 581 1st Qu.: 98.0 1st Qu.: 134 1st Qu.: 20.00
## N : 536 Median : 226.0 Median : 426 Median : 20.00
## M : 451 Mean : 497.7 Mean : 1537 Mean : 29.24
## I : 381 3rd Qu.: 448.0 3rd Qu.: 991 3rd Qu.: 20.00
## H : 321 Max. :14103.0 Max. :56671 Max. :200.00
## (Other):1072
## NumPending Hour Day Class
## Min. : 0.00 Min. : 0.01667 Mon:692 VF:2211
## 1st Qu.: 0.00 1st Qu.:10.90000 Tue:900 F :1347
## Median : 0.00 Median :14.01667 Wed:903 M : 514
## Mean : 53.39 Mean :13.73376 Thu:720 L : 259
## 3rd Qu.: 0.00 3rd Qu.:16.60000 Fri:923
## Max. :5605.00 Max. :23.98333 Sat: 32
## Sun:161
##################################
# Formulating a data type assessment summary
##################################
# Working copy of the dataset for the data type assessment.
PDA <- schedulingData
# One row per column of PDA: its position, its name and its primary class.
# class(x)[1] together with vapply() guarantees exactly one type string per
# column, so multi-class columns (e.g. ordered factors) cannot misalign or
# break the summary table.
(PDA.Summary <- data.frame(
  Column.Index = seq_along(PDA),
  Column.Name = names(PDA),
  Column.Type = vapply(PDA, function(x) class(x)[1], character(1)),
  row.names = NULL
))
## Column.Index Column.Name Column.Type
## 1 1 Protocol factor
## 2 2 Compounds numeric
## 3 3 InputFields numeric
## 4 4 Iterations numeric
## 5 5 NumPending numeric
## 6 6 Hour numeric
## 7 7 Day factor
## 8 8 Class factor
##################################
# Loading dataset
##################################
# Working copy of the dataset for the data quality assessment.
DQA <- schedulingData
##################################
# Listing all predictors
##################################
# Every column except the response `Class`; drop = FALSE keeps a data frame
# even if only a single predictor remains.
DQA.Predictors <- DQA[, !names(DQA) %in% c("Class"), drop = FALSE]
##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column of DQA:
#   Column.Type - primary class (exactly one string per column)
#   Row.Count   - total number of rows in DQA
#   NA.Count    - number of missing values in the column
#   Fill.Rate   - share of non-missing values, formatted to 3 decimals
(DQA.Summary <- data.frame(
  Column.Index = seq_along(DQA),
  Column.Name = names(DQA),
  Column.Type = vapply(DQA, function(x) class(x)[1], character(1)),
  Row.Count = rep(nrow(DQA), ncol(DQA)),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL
))
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 Protocol factor 4331 0 1.000
## 2 2 Compounds numeric 4331 0 1.000
## 3 3 InputFields numeric 4331 0 1.000
## 4 4 Iterations numeric 4331 0 1.000
## 5 5 NumPending numeric 4331 0 1.000
## 6 6 Hour numeric 4331 0 1.000
## 7 7 Day factor 4331 0 1.000
## 8 8 Class factor 4331 0 1.000
##################################
# Listing all numeric predictors
##################################
# Numeric predictors only. drop = FALSE keeps the result a data frame when
# exactly one column qualifies; without it the subset collapses to a bare
# vector, names() becomes NULL and the check below would wrongly report that
# there are no numeric predictors.
DQA.Predictors.Numeric <- DQA.Predictors[
  ,
  vapply(DQA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
if (ncol(DQA.Predictors.Numeric) > 0) {
  print(paste0("There are ",
               ncol(DQA.Predictors.Numeric),
               " numeric predictor variable(s)."))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 5 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# Factor predictors only. drop = FALSE keeps the result a data frame when
# exactly one column qualifies; without it the subset collapses to a bare
# factor, names() becomes NULL and the check below would wrongly report that
# there are no factor predictors.
DQA.Predictors.Factor <- DQA.Predictors[
  ,
  vapply(DQA.Predictors, is.factor, logical(1)),
  drop = FALSE
]
if (ncol(DQA.Predictors.Factor) > 0) {
  print(paste0("There are ",
               ncol(DQA.Predictors.Factor),
               " factor predictor variable(s)."))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are 2 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns every non-missing value of x that is tied for the highest
  # frequency (callers take element [1]).
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent value(s) of x once the first mode(s) are
  # excluded. Missing values are removed up front, so NA can never be counted
  # and reported as the second mode.
  SecondModes <- function(x) {
    x <- x[!is.na(x)]
    ux <- unique(x)
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- x[!(x %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }
  # One row per factor predictor: cardinality, the two most frequent levels,
  # their counts, and the ratios used later as low-variance indicators.
  # Ratios are formatted as 3-decimal strings.
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = vapply(
      DQA.Predictors.Factor,
      function(x) class(x)[1],
      character(1)
    ),
    Unique.Count = vapply(
      DQA.Predictors.Factor,
      function(x) length(unique(x)),
      integer(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(FirstModes(x)[1]),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(SecondModes(x)[1]),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(
          round(length(unique(x)) / nrow(DQA.Predictors.Factor), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        first_count <- sum(na.omit(x) == FirstModes(x)[1])
        second_count <- sum(na.omit(x) == SecondModes(x)[1])
        format(round(first_count / second_count, 3), nsmall = 3)
      },
      character(1)
    ),
    row.names = NULL
  ))
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 Protocol factor 14 J O
## 2 Day factor 7 Fri Wed
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio First.Second.Mode.Ratio
## 1 989 581 0.003 1.702
## 2 923 903 0.002 1.022
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Most frequent non-missing value(s) of x; ties are all returned.
  FirstModes <- function(x) {
    distinct <- unique(na.omit(x))
    freq <- tabulate(match(x, distinct))
    distinct[freq == max(freq)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Most frequent non-missing value(s) of x after the first mode(s) are
  # taken out; ties are all returned.
  SecondModes <- function(x) {
    observed <- na.omit(x)
    distinct <- unique(observed)
    freq <- tabulate(match(x, distinct))
    top <- distinct[freq == max(freq)]
    rest <- observed[!(observed %in% top)]
    rest_distinct <- unique(rest)
    rest_freq <- tabulate(match(rest, rest_distinct))
    rest_distinct[rest_freq == max(rest_freq)]
  }
  # One row per numeric predictor: cardinality, mode statistics and
  # distribution descriptors. All statistics except the raw counts are
  # rendered as 3-decimal strings.
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = sapply(DQA.Predictors.Numeric, class),
    Unique.Count = sapply(
      DQA.Predictors.Numeric,
      function(v) length(unique(v))
    ),
    Unique.Count.Ratio = sapply(DQA.Predictors.Numeric, function(v) {
      share <- length(unique(v)) / nrow(DQA.Predictors.Numeric)
      format(round(share, 3), nsmall = 3)
    }),
    First.Mode.Value = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(FirstModes(v)[1], 3), nsmall = 3)
    }),
    Second.Mode.Value = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(SecondModes(v)[1], 3), nsmall = 3)
    }),
    First.Mode.Count = sapply(DQA.Predictors.Numeric, function(v) {
      sum(na.omit(v) == FirstModes(v)[1])
    }),
    Second.Mode.Count = sapply(DQA.Predictors.Numeric, function(v) {
      sum(na.omit(v) == SecondModes(v)[1])
    }),
    First.Second.Mode.Ratio = sapply(DQA.Predictors.Numeric, function(v) {
      first_n <- sum(na.omit(v) == FirstModes(v)[1])
      second_n <- sum(na.omit(v) == SecondModes(v)[1])
      format(round(first_n / second_n, 3), nsmall = 3)
    }),
    Minimum = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(min(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Mean = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(mean(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Median = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(median(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Maximum = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(max(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Skewness = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(skewness(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Kurtosis = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(kurtosis(v, na.rm = TRUE), 3), nsmall = 3)
    }),
    Percentile25th = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(quantile(v, probs = 0.25, na.rm = TRUE), 3), nsmall = 3)
    }),
    Percentile75th = sapply(DQA.Predictors.Numeric, function(v) {
      format(round(quantile(v, probs = 0.75, na.rm = TRUE), 3), nsmall = 3)
    }),
    row.names = NULL
  ))
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 Compounds numeric 858 0.198 20.000
## 2 InputFields numeric 1730 0.399 10.000
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## 5 Hour numeric 924 0.213 13.083
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 31.000 96 29 3.310
## 2 466.000 82 27 3.037
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## 5 21.067 28 25 1.120
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 20.000 497.742 226.000 14103.000 6.568 69.486 98.000
## 2 10.000 1537.055 426.000 56671.000 5.870 54.919 134.000
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## 5 0.017 13.734 14.017 23.983 -0.546 3.747 10.900
## Percentile75th
## 1 448.000
## 2 991.000
## 3 20.000
## 4 0.000
## 5 16.600
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report the columns of DQA.Summary that contain at least one missing value;
# the matching summary rows are shown as the value of the branch.
if (any(DQA.Summary$NA.Count > 0)) {
  print(paste0(
    "Missing observations noted for ",
    sum(DQA.Summary$NA.Count > 0),
    " variable(s) with NA.Count>0 and Fill.Rate<1.0."
  ))
  DQA.Summary[DQA.Summary$NA.Count > 0, ]
} else {
  print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor predictors whose most frequent level outnumbers the runner-up
# by more than 5 to 1. The ratio column holds formatted text, hence the
# conversion back to numeric before comparing.
if (length(names(DQA.Predictors.Factor)) == 0) {
  print("No factor predictors noted.")
} else {
  DQA.Factor.HighModeRatio <- DQA.Predictors.Factor.Summary[
    as.numeric(as.character(
      DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio
    )) > 5,
  ]
  if (nrow(DQA.Factor.HighModeRatio) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Factor.HighModeRatio),
      " factor variable(s) with First.Second.Mode.Ratio>5."
    ))
    DQA.Factor.HighModeRatio
  } else {
    print("No low variance factor predictors due to high first-second mode ratio noted.")
  }
}
## [1] "No low variance factor predictors due to high first-second mode ratio noted."
# Same rule applied to the numeric predictors.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.HighModeRatio <- DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(
      DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio
    )) > 5,
  ]
  if (nrow(DQA.Numeric.HighModeRatio) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Numeric.HighModeRatio),
      " numeric variable(s) with First.Second.Mode.Ratio>5."
    ))
    DQA.Numeric.HighModeRatio
  } else {
    print("No low variance numeric predictors due to high first-second mode ratio noted.")
  }
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## Percentile75th
## 3 20.000
## 4 0.000
# Flag numeric predictors whose distinct values make up less than 1% of the
# rows. The ratio column holds formatted text, hence the conversion back to
# numeric before comparing.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.LowUniqueRatio <- DQA.Predictors.Numeric.Summary[
    as.numeric(as.character(
      DQA.Predictors.Numeric.Summary$Unique.Count.Ratio
    )) < 0.01,
  ]
  if (nrow(DQA.Numeric.LowUniqueRatio) > 0) {
    print(paste0(
      "Low variance observed for ",
      nrow(DQA.Numeric.LowUniqueRatio),
      " numeric variable(s) with Unique.Count.Ratio<0.01."
    ))
    DQA.Numeric.LowUniqueRatio
  } else {
    print("No low variance numeric predictors due to low unique count ratio noted.")
  }
}
## [1] "Low variance observed for 1 numeric variable(s) with Unique.Count.Ratio<0.01."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 3 Iterations numeric 11 0.003 20.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 3 10.000 3568 272 13.118
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000 20.000
##################################
# Checking for skewed predictors
##################################
# Flag numeric predictors whose skewness lies outside [-3, 3]. The skewness
# column holds formatted text, hence the conversion back to numeric.
if (length(names(DQA.Predictors.Numeric)) == 0) {
  print("No numeric predictors noted.")
} else {
  DQA.Numeric.HighSkew <- DQA.Predictors.Numeric.Summary[
    abs(as.numeric(as.character(
      DQA.Predictors.Numeric.Summary$Skewness
    ))) > 3,
  ]
  if (nrow(DQA.Numeric.HighSkew) > 0) {
    print(paste0(
      "High skewness observed for ",
      nrow(DQA.Numeric.HighSkew),
      " numeric variable(s) with Skewness>3 or Skewness<(-3)."
    ))
    DQA.Numeric.HighSkew
  } else {
    print("No skewed numeric predictors noted.")
  }
}
## [1] "High skewness observed for 4 numeric variable(s) with Skewness>3 or Skewness<(-3)."
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 Compounds numeric 858 0.198 20.000
## 2 InputFields numeric 1730 0.399 10.000
## 3 Iterations numeric 11 0.003 20.000
## 4 NumPending numeric 303 0.070 0.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 31.000 96 29 3.310
## 2 466.000 82 27 3.037
## 3 10.000 3568 272 13.118
## 4 1.000 3275 165 19.848
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 20.000 497.742 226.000 14103.000 6.568 69.486 98.000
## 2 10.000 1537.055 426.000 56671.000 5.870 54.919 134.000
## 3 10.000 29.244 20.000 200.000 3.937 18.510 20.000
## 4 0.000 53.389 0.000 5605.000 9.718 105.594 0.000
## Percentile75th
## 1 448.000
## 2 991.000
## 3 20.000
## 4 0.000
##################################
# Loading dataset
##################################
# Working copy of the dataset for the preprocessing assessment.
DPA <- schedulingData
##################################
# Gathering descriptive statistics
##################################
# skim() summarises every column; the outer parentheses print the result.
# The rendered output is kept below as comments so the script stays parseable.
(DPA_Skimmed <- skim(DPA))
## | Name | DPA |
## | Number of rows | 4331 |
## | Number of columns | 8 |
## | _______________________ | |
## | Column type frequency: | |
## | factor | 3 |
## | numeric | 5 |
## | ________________________ | |
## | Group variables | None |
## Variable type: factor
## | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## |---|---|---|---|---|---|
## | Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## | Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## | Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## | InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## | Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## | NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## | Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Identifying columns with missing data
#################################
# Keep only the skim rows of columns that have at least one missing value.
dplyr::filter(skim(DPA), n_missing > 0)
## # A tibble: 0 x 15
## # i 15 variables: skim_type <chr>, skim_variable <chr>, n_missing <int>,
## # complete_rate <dbl>, factor.ordered <lgl>, factor.n_unique <int>,
## # factor.top_counts <chr>, numeric.mean <dbl>, numeric.sd <dbl>,
## # numeric.p0 <dbl>, numeric.p25 <dbl>, numeric.p50 <dbl>, numeric.p75 <dbl>,
## # numeric.p100 <dbl>, numeric.hist <chr>
##################################
# Loading dataset
##################################
# Working copy of the dataset for the outlier assessment.
DPA <- schedulingData
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when a single column is numeric.
DPA.Predictors.Numeric <- DPA.Predictors[
  ,
  vapply(DPA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Identifying outliers for the numeric predictors
##################################
# One boxplot per numeric predictor; outliers are the points boxplot.stats()
# reports in $out. The count vector is preallocated and seq_len() keeps the
# loop from running when there are no numeric predictors.
OutlierCountList <- integer(ncol(DPA.Predictors.Numeric))
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  Outliers <- boxplot.stats(DPA.Predictors.Numeric[, i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  # Row positions of the flagged observations for the current predictor.
  OutlierIndices <- which(DPA.Predictors.Numeric[, i] %in% Outliers)
  boxplot(DPA.Predictors.Numeric[, i],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal = TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}
# Built column-wise so the counts stay numeric instead of being coerced to
# character through cbind() and converted back.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- sum(OutlierCountSummary$OutlierCount > 0)
print(paste0(NumericPredictorWithOutlierCount,
             " numeric variable(s) were noted with outlier(s)."))
## [1] "5 numeric variable(s) were noted with outlier(s)."
##################################
# Gathering descriptive statistics
##################################
# Descriptive statistics of the untransformed numeric predictors; the outer
# parentheses print the result. The rendered output is kept below as comments
# so the script stays parseable.
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
## | Name | DPA.Predictors.Numeric |
## | Number of rows | 4331 |
## | Number of columns | 5 |
## | _______________________ | |
## | Column type frequency: | |
## | numeric | 5 |
## | ________________________ | |
## | Group variables | None |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## | InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## | Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## | NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## | Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a center, scale and spatial sign data transformation
##################################
# Estimate the centering, scaling and spatial sign transformation from the
# numeric predictors, then apply it to the same data.
DPA_CenteredScaledSpatialSigned <- preProcess(
  DPA.Predictors.Numeric,
  method = c("center", "scale", "spatialSign")
)
DPA_CenteredScaledSpatialSignedTransformed <- predict(
  DPA_CenteredScaledSpatialSigned,
  DPA.Predictors.Numeric
)
##################################
# Gathering descriptive statistics
##################################
# Descriptive statistics after the transformation; the outer parentheses
# print the result. The rendered output is kept below as comments so the
# script stays parseable.
(DPA_CenteredScaledSpatialSignedTransformedSkimmed <- skim(
  DPA_CenteredScaledSpatialSignedTransformed
))
## | Name | DPA_CenteredScaledSpatial… |
## | Number of rows | 4331 |
## | Number of columns | 5 |
## | _______________________ | |
## | Column type frequency: | |
## | numeric | 5 |
## | ________________________ | |
## | Group variables | None |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Compounds | 0 | 1 | -0.14 | 0.37 | -0.82 | -0.39 | -0.21 | -0.04 | 1.00 | ▂▇▃▁▁ |
## | InputFields | 0 | 1 | -0.16 | 0.41 | -0.81 | -0.41 | -0.25 | -0.07 | 1.00 | ▃▇▂▁▂ |
## | Iterations | 0 | 1 | -0.18 | 0.37 | -0.92 | -0.38 | -0.25 | -0.14 | 1.00 | ▁▇▂▁▁ |
## | NumPending | 0 | 1 | -0.09 | 0.21 | -0.46 | -0.18 | -0.13 | -0.06 | 1.00 | ▃▇▁▁▁ |
## | Hour | 0 | 1 | 0.04 | 0.65 | -0.99 | -0.63 | 0.08 | 0.70 | 0.99 | ▇▃▅▃▇ |
##################################
# Identifying outliers for the numeric predictors
##################################
# One boxplot per transformed numeric predictor; outliers are the points
# boxplot.stats() reports in $out. The count vector is preallocated and
# seq_len() keeps the loop from running when there are no numeric predictors.
OutlierCountList <- integer(ncol(DPA.Predictors.Numeric))
for (i in seq_len(ncol(DPA.Predictors.Numeric))) {
  Outliers <- boxplot.stats(DPA_CenteredScaledSpatialSignedTransformed[, i])$out
  OutlierCount <- length(Outliers)
  OutlierCountList[i] <- OutlierCount
  # Row positions of the flagged observations. The outlier values come from
  # the transformed data, so they must be matched against the transformed
  # column, not the untransformed one.
  OutlierIndices <- which(
    DPA_CenteredScaledSpatialSignedTransformed[, i] %in% Outliers
  )
  boxplot(DPA_CenteredScaledSpatialSignedTransformed[, i],
          ylab = names(DPA.Predictors.Numeric)[i],
          main = names(DPA.Predictors.Numeric)[i],
          horizontal = TRUE)
  mtext(paste0(OutlierCount, " Outlier(s) Detected"))
}
# Built column-wise so the counts stay numeric instead of being coerced to
# character through cbind() and converted back.
OutlierCountSummary <- data.frame(
  NumericPredictors = names(DPA.Predictors.Numeric),
  OutlierCount = as.numeric(OutlierCountList)
)
NumericPredictorWithOutlierCount <- sum(OutlierCountSummary$OutlierCount > 0)
print(paste0(NumericPredictorWithOutlierCount,
             " numeric variable(s) were noted with outlier(s)."))
## [1] "4 numeric variable(s) were noted with outlier(s)."
##################################
# Loading dataset
##################################
# Working copy of the dataset for the low-variance assessment.
DPA <- schedulingData
##################################
# Gathering descriptive statistics
##################################
# skim() summarises every column; the outer parentheses print the result.
# The rendered output is kept below as comments so the script stays parseable.
(DPA_Skimmed <- skim(DPA))
## | Name | DPA |
## | Number of rows | 4331 |
## | Number of columns | 8 |
## | _______________________ | |
## | Column type frequency: | |
## | factor | 3 |
## | numeric | 5 |
## | ________________________ | |
## | Group variables | None |
## Variable type: factor
## | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## |---|---|---|---|---|---|
## | Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## | Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## | Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## | InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## | Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
## | NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
## | Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Identifying columns with low variance
###################################
# Near-zero-variance metrics for every column of DPA. With freqCut = 80/20 a
# column is flagged when its most common value is more than 4 times as
# frequent as the second most common one and, per uniqueCut = 10, its
# distinct values make up less than 10% of the rows.
DPA_LowVariance <- nearZeroVar(DPA,
                               freqCut = 80/20,
                               uniqueCut = 10,
                               saveMetrics = TRUE)
(DPA_LowVariance[DPA_LowVariance$nzv, ])
## freqRatio percentUnique zeroVar nzv
## Iterations 13.11765 0.2539829 FALSE TRUE
## NumPending 19.84848 6.9960748 FALSE TRUE
# Names of the flagged columns, computed once and reused below.
DPA_LowVarianceVariables <- rownames(DPA_LowVariance)[DPA_LowVariance$nzv]
if (length(DPA_LowVarianceVariables) == 0) {
  print("No low variance predictors noted.")
} else {
  print(paste0("Low variance observed for ",
               length(DPA_LowVarianceVariables),
               " numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."))
  DPA_LowVarianceForRemoval <- length(DPA_LowVarianceVariables)
  print(paste0("Low variance can be resolved by removing ",
               DPA_LowVarianceForRemoval,
               " numeric variable(s)."))
  for (j in seq_len(DPA_LowVarianceForRemoval)) {
    DPA_LowVarianceRemovedVariable <- DPA_LowVarianceVariables[j]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LowVarianceRemovedVariable))
  }
  # Only the last expression of a braced block is auto-printed, so the skim
  # of the flagged columns has to be printed explicitly to be displayed.
  print(
    DPA %>%
      skim() %>%
      dplyr::filter(skim_variable %in% DPA_LowVarianceVariables)
  )
  ##################################
  # Filtering out columns with low variance
  ##################################
  # drop = FALSE keeps a data frame even if a single column survives.
  DPA_ExcludedLowVariance <- DPA[
    ,
    !names(DPA) %in% DPA_LowVarianceVariables,
    drop = FALSE
  ]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLowVariance_Skimmed <- skim(DPA_ExcludedLowVariance))
}
## [1] "Low variance observed for 2 numeric variable(s) with First.Second.Mode.Ratio>4 and Unique.Count.Ratio<0.10."
## [1] "Low variance can be resolved by removing 2 numeric variable(s)."
## [1] "Variable 1 for removal: Iterations"
## [1] "Variable 2 for removal: NumPending"
## | Name | DPA_ExcludedLowVariance |
## | Number of rows | 4331 |
## | Number of columns | 6 |
## | _______________________ | |
## | Column type frequency: | |
## | factor | 3 |
## | numeric | 3 |
## | ________________________ | |
## | Group variables | None |
## Variable type: factor
## | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## |---|---|---|---|---|---|
## | Protocol | 0 | 1 | FALSE | 14 | J: 989, O: 581, N: 536, M: 451 |
## | Day | 0 | 1 | FALSE | 7 | Fri: 923, Wed: 903, Tue: 900, Thu: 720 |
## | Class | 0 | 1 | FALSE | 4 | VF: 2211, F: 1347, M: 514, L: 259 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
## | InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
## | Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Loading dataset
##################################
# Working copy of the dataset for the correlation assessment.
DPA <- schedulingData
##################################
# Listing all predictors
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
##################################
# drop = FALSE keeps a data frame even when a single column is numeric;
# cor() and cor.mtest() below need matrix-like input, not a bare vector.
DPA.Predictors.Numeric <- DPA.Predictors[
  ,
  vapply(DPA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
##################################
# Visualizing pairwise correlation between predictors
##################################
# Pairwise significance tests; the p-value matrix is used to blank out
# correlations that are not significant at the 0.05 level in the plot.
DPA_CorrelationTest <- cor.mtest(DPA.Predictors.Numeric,
                                 method = "pearson",
                                 conf.level = .95)
corrplot(cor(DPA.Predictors.Numeric,
             method = "pearson",
             use = "pairwise.complete.obs"),
         method = "circle",
         type = "upper",
         order = "original",
         tl.col = "black",
         tl.cex = 0.75,
         tl.srt = 90,
         sig.level = 0.05,
         p.mat = DPA_CorrelationTest$p,
         insig = "blank")
##################################
# Identifying the highly correlated variables
##################################
DPA_Correlation <- cor(DPA.Predictors.Numeric,
method = "pearson",
use="pairwise.complete.obs")
(DPA_HighlyCorrelatedCount <- sum(abs(DPA_Correlation[upper.tri(DPA_Correlation)]) > 0.95))## [1] 0
# Reports whether any predictor pair exceeded the 0.95 correlation cutoff;
# when some did, the top pairs are also listed through corr_cross().
if (DPA_HighlyCorrelatedCount > 0) {
  print(paste0("High correlation observed for ",
               DPA_HighlyCorrelatedCount,
               " pairs of numeric variable(s) with Correlation.Coefficient>0.95."))
  (DPA_HighlyCorrelatedPairs <- corr_cross(DPA.Predictors.Numeric,
                                           top = DPA_HighlyCorrelatedCount,
                                           max_pvalue = 0.05,
                                           grid = FALSE,
                                           rm.na = TRUE))
} else {
  print("No highly correlated predictors noted.")
}
## [1] "No highly correlated predictors noted."
# Lists and removes the predictors flagged by findCorrelation().
# Runs only when at least one highly correlated pair was counted above.
if (DPA_HighlyCorrelatedCount > 0) {
  # findCorrelation() returns column positions within DPA_Correlation
  # (numeric predictors only), so they are translated to column names
  # before being applied to DPA, which also holds factor columns.
  DPA_HighlyCorrelated <- findCorrelation(DPA_Correlation, cutoff = 0.95)
  DPA_HighlyCorrelatedNames <- colnames(DPA_Correlation)[DPA_HighlyCorrelated]
  DPA_HighlyCorrelatedForRemoval <- length(DPA_HighlyCorrelated)
  print(paste0("High correlation can be resolved by removing ",
               DPA_HighlyCorrelatedForRemoval,
               " numeric variable(s)."))
  # seq_along() keeps the loop from running when nothing was flagged.
  for (j in seq_along(DPA_HighlyCorrelatedNames)) {
    DPA_HighlyCorrelatedRemovedVariable <- DPA_HighlyCorrelatedNames[j]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_HighlyCorrelatedRemovedVariable))
  }
  ##################################
  # Filtering out columns with high correlation
  # (excluded by name; drop = FALSE keeps a data frame)
  ##################################
  DPA_ExcludedHighCorrelation <- DPA[, !names(DPA) %in% DPA_HighlyCorrelatedNames, drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedHighCorrelation_Skimmed <- skim(DPA_ExcludedHighCorrelation))
}
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
# (every column except the Class response)
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all numeric predictors
# (drop = FALSE keeps a data frame even with one numeric column)
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1)), drop = FALSE]
##################################
# Finding linear dependencies
# (computed once and reused by the sections below)
##################################
DPA_LinearlyDependent <- findLinearCombos(DPA.Predictors.Numeric)
##################################
# Identifying the linearly dependent variables
# (one entry in $linearCombos per dependent subset)
##################################
(DPA_LinearlyDependentCount <- length(DPA_LinearlyDependent$linearCombos))
## [1] 0
# Reports each linearly dependent subset found by findLinearCombos().
if (DPA_LinearlyDependentCount == 0) {
  print("No linearly dependent predictors noted.")
} else {
  print(paste0("Linear dependency observed for ",
               DPA_LinearlyDependentCount,
               " subset(s) of numeric variable(s)."))
  for (i in seq_len(DPA_LinearlyDependentCount)) {
    DPA_LinearlyDependentSubset <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$linearCombos[[i]]]
    # Collapse the subset into one string; otherwise paste0() recycles the
    # sentence once per variable and prints a vector of near-duplicates.
    print(paste0("Linear dependent variable(s) for subset ",
                 i,
                 " include: ",
                 paste(DPA_LinearlyDependentSubset, collapse = ", ")))
  }
}
## [1] "No linearly dependent predictors noted."
##################################
# Identifying the linearly dependent variables for removal
##################################
if (DPA_LinearlyDependentCount > 0) {
  # Reuses DPA_LinearlyDependent computed above; $remove holds column
  # positions within DPA.Predictors.Numeric.
  DPA_LinearlyDependentForRemoval <- length(DPA_LinearlyDependent$remove)
  print(paste0("Linear dependency can be resolved by removing ",
               DPA_LinearlyDependentForRemoval,
               " numeric variable(s)."))
  # seq_along() keeps the loop from running when $remove is empty.
  for (j in seq_along(DPA_LinearlyDependent$remove)) {
    DPA_LinearlyDependentRemovedVariable <- colnames(DPA.Predictors.Numeric)[DPA_LinearlyDependent$remove[j]]
    print(paste0("Variable ",
                 j,
                 " for removal: ",
                 DPA_LinearlyDependentRemovedVariable))
  }
  ##################################
  # Filtering out columns with linear dependency
  # (drop = FALSE keeps a data frame when one column survives)
  ##################################
  DPA_ExcludedLinearlyDependent <- DPA.Predictors.Numeric[, -DPA_LinearlyDependent$remove, drop = FALSE]
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_ExcludedLinearlyDependent_Skimmed <- skim(DPA_ExcludedLinearlyDependent))
}
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
# (every column except the Class response)
##################################
DPA.Predictors <- DPA[, setdiff(names(DPA), "Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1))]
##################################
# Gathering descriptive statistics
# (baseline summary before any transformation)
##################################
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
| Name | DPA.Predictors.Numeric |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a center transformation
# (estimate the transform, then apply it to the same data)
##################################
DPA_Centered <- preProcess(x = DPA.Predictors.Numeric,
                           method = "center")
DPA_CenteredTransformed <- predict(object = DPA_Centered,
                                   newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_CenteredTransformedSkimmed <- skim(DPA_CenteredTransformed))
| Name | DPA_CenteredTransformed |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 0 | 1020.17 | -477.74 | -399.74 | -271.74 | -49.74 | 13605.26 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 0 | 3650.08 | -1527.06 | -1403.06 | -1111.06 | -546.06 | 55133.94 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 0 | 34.42 | -19.24 | -9.24 | -9.24 | -9.24 | 170.76 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 0 | 355.96 | -53.39 | -53.39 | -53.39 | -53.39 | 5551.61 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 0 | 3.98 | -13.72 | -2.83 | 0.28 | 2.87 | 10.25 | ▁▂▇▇▁ |
##################################
# Applying a center and scale data transformation
# (estimate the transform, then apply it to the same data)
##################################
DPA_CenteredScaled <- preProcess(x = DPA.Predictors.Numeric,
                                 method = c("center", "scale"))
DPA_CenteredScaledTransformed <- predict(object = DPA_CenteredScaled,
                                         newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_CenteredScaledTransformedSkimmed <- skim(DPA_CenteredScaledTransformed))
| Name | DPA_CenteredScaledTransfo… |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 0 | 1 | -0.47 | -0.39 | -0.27 | -0.05 | 13.34 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 0 | 1 | -0.42 | -0.38 | -0.30 | -0.15 | 15.10 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 0 | 1 | -0.56 | -0.27 | -0.27 | -0.27 | 4.96 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 0 | 1 | -0.15 | -0.15 | -0.15 | -0.15 | 15.60 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 0 | 1 | -3.45 | -0.71 | 0.07 | 0.72 | 2.57 | ▁▂▇▇▁ |
##################################
# Applying a range transformation
# (rescales each column onto the [0, 1] bounds given below)
##################################
DPA_Ranged <- preProcess(x = DPA.Predictors.Numeric,
                         rangeBounds = c(0, 1),
                         method = "range")
DPA_RangedTransformed <- predict(object = DPA_Ranged,
                                 newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_RangedTransformedSkimmed <- skim(DPA_RangedTransformed))
| Name | DPA_RangedTransformed |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 0.03 | 0.07 | 0 | 0.01 | 0.01 | 0.03 | 1 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 0.03 | 0.06 | 0 | 0.00 | 0.01 | 0.02 | 1 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 0.10 | 0.18 | 0 | 0.05 | 0.05 | 0.05 | 1 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 0.01 | 0.06 | 0 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 0.57 | 0.17 | 0 | 0.45 | 0.58 | 0.69 | 1 | ▁▂▇▇▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
# (every column except the Class response)
##################################
DPA.Predictors <- DPA[, setdiff(names(DPA), "Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1))]
##################################
# Gathering descriptive statistics
# (baseline summary before the shape transformations)
##################################
(DPA_Skimmed <- skim(DPA.Predictors.Numeric))
| Name | DPA.Predictors.Numeric |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
##################################
# Applying a Box-Cox transformation
# (NumPending appears unchanged in the summary below)
##################################
DPA_BoxCox <- preProcess(x = DPA.Predictors.Numeric,
                         method = "BoxCox")
DPA_BoxCoxTransformed <- predict(object = DPA_BoxCox,
                                 newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_BoxCoxTransformedSkimmed <- skim(DPA_BoxCoxTransformed))
| Name | DPA_BoxCoxTransformed |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 5.36 | 1.25 | 3.00 | 4.58 | 5.42 | 6.10 | 9.55 | ▅▇▇▂▁ |
| InputFields | 0 | 1 | 5.98 | 1.65 | 2.30 | 4.90 | 6.05 | 6.90 | 10.95 | ▂▆▇▃▁ |
| Iterations | 0 | 1 | 0.95 | 0.02 | 0.90 | 0.95 | 0.95 | 0.95 | 1.00 | ▁▁▇▁▁ |
| NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.00 | 0.00 | 0.00 | 5605.00 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 22.83 | 8.41 | -0.77 | 16.40 | 23.04 | 28.89 | 47.09 | ▁▆▇▆▁ |
##################################
# Applying a Yeo-Johnson transformation
# (estimate the transform, then apply it to the same data)
##################################
DPA_YeoJohnson <- preProcess(x = DPA.Predictors.Numeric,
                             method = "YeoJohnson")
DPA_YeoJohnsonTransformed <- predict(object = DPA_YeoJohnson,
                                     newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_YeoJohnsonTransformedSkimmed <- skim(DPA_YeoJohnsonTransformed))
| Name | DPA_YeoJohnsonTransformed |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 4.32 | 0.80 | 2.70 | 3.84 | 4.40 | 4.83 | 6.67 | ▃▅▇▂▁ |
| InputFields | 0 | 1 | 5.40 | 1.35 | 2.31 | 4.53 | 5.49 | 6.17 | 9.19 | ▂▅▇▃▁ |
| Iterations | 0 | 1 | 0.92 | 0.01 | 0.88 | 0.92 | 0.92 | 0.92 | 0.95 | ▁▁▇▁▁ |
| NumPending | 0 | 1 | 0.19 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 0.91 | ▇▁▁▁▂ |
| Hour | 0 | 1 | 33.41 | 12.40 | 0.02 | 23.80 | 33.53 | 42.30 | 70.46 | ▁▆▇▅▁ |
##################################
# Applying an exponential transformation
# (Compounds, InputFields and NumPending appear unchanged
# in the summary below)
##################################
DPA_ExpoTrans <- preProcess(x = DPA.Predictors.Numeric,
                            method = "expoTrans")
DPA_ExpoTransTransformed <- predict(object = DPA_ExpoTrans,
                                    newdata = DPA.Predictors.Numeric)
##################################
# Gathering descriptive statistics
##################################
(DPA_ExpoTransTransformedSkimmed <- skim(DPA_ExpoTransTransformed))
| Name | DPA_ExpoTransTransformed |
| Number of rows | 4331 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.00 | 226.00 | 448.00 | 14103.00 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.00 | 426.00 | 991.00 | 56671.00 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 12.31 | 2.08 | 7.64 | 12.00 | 12.00 | 12.00 | 17.75 | ▁▁▇▁▁ |
| NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.00 | 0.00 | 0.00 | 5605.00 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 19.04 | 6.86 | 0.02 | 13.76 | 18.98 | 23.83 | 40.95 | ▁▆▇▃▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
# (every column except the Class response)
##################################
DPA.Predictors <- DPA[, !names(DPA) %in% c("Class"), drop = FALSE]
##################################
# Listing all factor predictors
# (drop = FALSE keeps a data frame when only one factor column
# exists, so it is still counted below)
##################################
DPA.Predictors.Factor <- DPA.Predictors[, vapply(DPA.Predictors, is.factor, logical(1)), drop = FALSE]
##################################
# Applying dummy variable creation
##################################
if (ncol(DPA.Predictors.Factor) > 0) {
  print(paste0("There are ",
               ncol(DPA.Predictors.Factor),
               " factor variables for dummy variable creation."))
  # Class is the left-hand side of the formula, so only predictors are
  # expanded; numeric predictors pass through (see summary below).
  DPA_DummyVariables <- dummyVars(Class ~ ., data = DPA)
  DPA_DummyVariablesCreated <- predict(DPA_DummyVariables, DPA)
  ##################################
  # Gathering descriptive statistics
  ##################################
  (DPA_DummyVariablesCreatedSkimmed <- skim(DPA_DummyVariablesCreated))
} else {
  print("There are no factor variables for dummy variable creation.")
}
## [1] "There are 2 factor variables for dummy variable creation."
| Name | DPA_DummyVariablesCreated |
| Number of rows | 4331 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Protocol.A | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.C | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.D | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.E | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.F | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.G | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.H | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.I | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.J | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Protocol.K | 0 | 1 | 0.00 | 0.04 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.L | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.M | 0 | 1 | 0.10 | 0.31 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.N | 0 | 1 | 0.12 | 0.33 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Protocol.O | 0 | 1 | 0.13 | 0.34 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Compounds | 0 | 1 | 497.74 | 1020.17 | 20.00 | 98.0 | 226.00 | 448.0 | 14103.00 | ▇▁▁▁▁ |
| InputFields | 0 | 1 | 1537.06 | 3650.08 | 10.00 | 134.0 | 426.00 | 991.0 | 56671.00 | ▇▁▁▁▁ |
| Iterations | 0 | 1 | 29.24 | 34.42 | 10.00 | 20.0 | 20.00 | 20.0 | 200.00 | ▇▁▁▁▁ |
| NumPending | 0 | 1 | 53.39 | 355.96 | 0.00 | 0.0 | 0.00 | 0.0 | 5605.00 | ▇▁▁▁▁ |
| Hour | 0 | 1 | 13.73 | 3.98 | 0.02 | 10.9 | 14.02 | 16.6 | 23.98 | ▁▂▇▇▁ |
| Day.Mon | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Day.Tue | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Day.Wed | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Day.Thu | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Day.Fri | 0 | 1 | 0.21 | 0.41 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▂ |
| Day.Sat | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
| Day.Sun | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.0 | 0.00 | 0.0 | 1.00 | ▇▁▁▁▁ |
##################################
# Loading dataset
##################################
DPA <- schedulingData
##################################
# Listing all predictors
# (every column except the Class response)
##################################
DPA.Predictors <- DPA[, setdiff(names(DPA), "Class")]
##################################
# Listing all numeric predictors
##################################
DPA.Predictors.Numeric <- DPA.Predictors[, vapply(DPA.Predictors, is.numeric, logical(1))]
ncol(DPA.Predictors.Numeric)
## [1] 5
##################################
# Converting response variable data type to factor
##################################
DPA$Class <- as.factor(DPA$Class)
nlevels(DPA$Class)
## [1] 4
##################################
# Formulating the box plots
# (numeric predictors against Class; layout is a single
# column with one row per predictor)
##################################
featurePlot(x = DPA.Predictors.Numeric,
            y = DPA$Class,
            plot = "box",
            layout = c(1, ncol(DPA.Predictors.Numeric)),
            pch = "|",
            adjust = 1.5,
            scales = list(y = list(relation = "free"),
                          x = list(rot = 90, relation = "free")))
##################################
# Formulating the strip plots
# (numeric predictors against Class, jittered; layout is a
# single column with one row per predictor)
##################################
featurePlot(x = DPA.Predictors.Numeric,
            y = DPA$Class,
            plot = "strip",
            layout = c(1, ncol(DPA.Predictors.Numeric)),
            jitter = TRUE,
            pch = "|",
            adjust = 1.5,
            scales = list(y = list(relation = "free"),
                          x = list(rot = 90, relation = "free")))
##################################
# Formulating the density plots
# (numeric predictors by Class; the legend gets one column
# per Class level)
##################################
featurePlot(x = DPA.Predictors.Numeric,
            y = DPA$Class,
            plot = "density",
            layout = c(1, ncol(DPA.Predictors.Numeric)),
            auto.key = list(columns = nlevels(DPA$Class)),
            pch = "|",
            adjust = 1.5,
            scales = list(y = list(relation = "free"),
                          x = list(rot = 90, relation = "free")))