##################################
# Loading R libraries
##################################
library(datasets)
library(caret)
library(moments)
library(missCompare)
library(mi)
library(missMDA)
library(pcaMethods)
library(missForest)
library(rms)
library(mice)
library(VIM)
library(misty)
library(Hmisc)
library(Amelia)
library(foreign)
library(RBtest)
##################################
# Defining file paths
##################################
<- file.path("datasets")
DATASETS_PATH
##################################
# Loading source and
# formulating the analysis set
##################################
<- read.spss(file.path("..", DATASETS_PATH, "TBI.sav"),
TBI use.value.labels=F,
to.data.frame=T)
##################################
# Performing a general exploration of the dataset
##################################
dim(TBI)
## [1] 2159 24
summary(TBI)
## trial d.gos d.mort d.unfav
## Length:2159 Min. :1.000 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :4.000 Median :0.000 Median :0.0000
## Mean :3.543 Mean :0.233 Mean :0.3942
## 3rd Qu.:5.000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.000 Max. :1.0000
##
## cause age d.motor d.pupil
## Min. :3.000 Min. :14.00 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:22.00 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :30.00 Median :4.000 Median :1.000
## Mean :4.908 Mean :33.21 Mean :3.991 Mean :1.458
## 3rd Qu.:6.000 3rd Qu.:43.00 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :9.000 Max. :79.00 Max. :6.000 Max. :3.000
## NA's :123
## pupil.i hypoxia hypotens ctclass
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2.000
## Median :1.000 Median :0.0000 Median :0.0000 Median :3.000
## Mean :1.462 Mean :0.2167 Mean :0.1795 Mean :3.281
## 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:5.000
## Max. :3.000 Max. :1.0000 Max. :1.0000 Max. :6.000
## NA's :249 NA's :59 NA's :25
## tsah edh cisterns shift
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.000 Median :0.0000
## Mean :0.4779 Mean :0.1268 Mean :1.455 Mean :0.1987
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :3.000 Max. :1.0000
## NA's :96 NA's :38 NA's :257 NA's :252
## d.sysbpt glucose glucoset ph
## Min. : 60.0 Min. : 0.5556 Min. : 3.000 Min. :6.790
## 1st Qu.:121.1 1st Qu.: 6.7000 1st Qu.: 6.700 1st Qu.:7.320
## Median :131.3 Median : 8.2000 Median : 8.200 Median :7.380
## Mean :131.4 Mean : 8.9078 Mean : 8.863 Mean :7.376
## 3rd Qu.:142.1 3rd Qu.:10.4000 3rd Qu.:10.400 3rd Qu.:7.450
## Max. :207.4 Max. :41.4000 Max. :20.000 Max. :7.660
## NA's :64 NA's :64 NA's :457
## sodium sodiumt hb hbt
## Min. :114.0 Min. :125.0 Min. : 2.50 Min. : 6.00
## 1st Qu.:137.0 1st Qu.:137.0 1st Qu.:10.90 1st Qu.:10.90
## Median :140.0 Median :140.0 Median :12.90 Median :12.90
## Mean :139.3 Mean :139.3 Mean :12.45 Mean :12.47
## 3rd Qu.:142.0 3rd Qu.:142.0 3rd Qu.:14.30 3rd Qu.:14.30
## Max. :154.0 Max. :154.0 Max. :18.60 Max. :17.00
## NA's :62 NA's :62 NA's :24 NA's :24
describe(TBI)
## TBI
##
## 24 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## d.gos
## n missing distinct Info Mean Gmd
## 2159 0 5 0.894 3.543 1.726
##
## Value 1 2 3 4 5
## Frequency 503 86 262 351 957
## Proportion 0.233 0.040 0.121 0.163 0.443
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## d.mort
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.536 503 0.233 0.3576
##
## --------------------------------------------------------------------------------
## d.unfav
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.716 851 0.3942 0.4778
##
## --------------------------------------------------------------------------------
## cause
## n missing distinct Info Mean Gmd
## 2159 0 5 0.921 4.908 2.301
##
## Value 3 4 5 6 9
## Frequency 848 420 134 370 387
## Proportion 0.393 0.195 0.062 0.171 0.179
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## d.motor
## n missing distinct Info Mean Gmd
## 2159 0 6 0.919 3.991 1.226
##
## Value 1 2 3 4 5 6
## Frequency 14 279 369 627 790 80
## Proportion 0.006 0.129 0.171 0.290 0.366 0.037
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## d.pupil
## n missing distinct Info Mean Gmd
## 2036 123 3 0.647 1.458 0.6881
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## pupil.i
## n missing distinct Info Mean Gmd
## 2159 0 3 0.65 1.462 0.6915
##
## Value 1 2 3
## Frequency 1511 299 349
## Proportion 0.700 0.138 0.162
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct Info Sum Mean Gmd
## 1910 249 2 0.509 414 0.2168 0.3397
##
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct Info Sum Mean Gmd
## 2100 59 2 0.442 377 0.1795 0.2947
##
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct Info Mean Gmd
## 2134 25 6 0.928 3.281 1.689
##
## Value 1 2 3 4 5 6
## Frequency 149 781 414 85 521 184
## Proportion 0.070 0.366 0.194 0.040 0.244 0.086
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## tsah
## n missing distinct Info Sum Mean Gmd
## 2063 96 2 0.749 986 0.4779 0.4993
##
## --------------------------------------------------------------------------------
## edh
## n missing distinct Info Sum Mean Gmd
## 2121 38 2 0.332 269 0.1268 0.2216
##
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct Info Mean Gmd
## 1902 257 3 0.716 1.455 0.6367
##
## Value 1 2 3
## Frequency 1223 492 187
## Proportion 0.643 0.259 0.098
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## shift
## n missing distinct Info Sum Mean Gmd
## 1907 252 2 0.478 379 0.1987 0.3187
##
## --------------------------------------------------------------------------------
## d.sysbpt
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60 64.19 65.6 71.33 73.71 , highest: 182.92 184.16 184.7 188.29 207.4
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 431 1 8.908 3.56 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 0.555556 1 1.03 1.04 1.22
## highest: 29.2222 30.8889 31.1111 32.5556 41.4
## --------------------------------------------------------------------------------
## glucoset
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3 3.03 3.05556 3.2 3.44444
## highest: 19.5 19.5556 19.7778 19.8 20
## --------------------------------------------------------------------------------
## ph
## n missing distinct Info Mean Gmd .05 .10
## 1702 457 112 0.999 7.376 0.1183 7.19 7.24
## .25 .50 .75 .90 .95
## 7.32 7.38 7.45 7.50 7.54
##
## lowest : 6.79 6.83 6.87 6.9 6.91, highest: 7.62 7.63 7.64 7.65 7.66
## --------------------------------------------------------------------------------
## sodium
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 62 0.993 139.3 4.448 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 114 117 118 121 123 , highest: 150 151 151.7 153 154
## --------------------------------------------------------------------------------
## sodiumt
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 55 0.993 139.3 4.408 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 125 126 126.5 127 127.4, highest: 150 151 151.7 153 154
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 156 1 12.45 2.846 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 2.5 2.8 3.4 3.6 4 , highest: 17.3 17.4 17.6 17.7 18.6
## --------------------------------------------------------------------------------
## hbt
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6 6.1 6.2 6.3 6.4 , highest: 16.6 16.7 16.8 16.9 17
## --------------------------------------------------------------------------------
##################################
# Performing re-categorization and re-grouping
# of factor variables
##################################
$pupil <- ifelse(TBI$d.pupil==9,NA,TBI$d.pupil)
TBI
$motor <- ifelse(is.na(TBI$d.motor),9,TBI$d.motor)
TBI$motor1 <- ifelse(TBI$motor==1,1,0)
TBI$motor2 <- ifelse(TBI$motor==2,1,0)
TBI$motor3 <- ifelse(TBI$motor==3,1,0)
TBI$motor4 <- ifelse(TBI$motor==4,1,0)
TBI$motor56 <- ifelse(TBI$motor==5 | TBI$motor==6,1,0)
TBI$motor9 <- ifelse(TBI$motor==9,1,0)
TBI$motorG <- ifelse(TBI$motor1==1,5,
TBIifelse(TBI$motor2==1,4,
ifelse(TBI$motor3==1,3,
ifelse(TBI$motor4==1,2,
ifelse(TBI$motor56==1,1,
ifelse(TBI$motor9==1,9,NA))))))
$motorG <- as.factor(TBI$motorG)
TBI
$mort <- ifelse(TBI$d.gos==1,1,ifelse(TBI$d.gos==6,NA,0))
TBI$deadveg <- ifelse(TBI$d.gos<3,1,ifelse(TBI$d.gos==6,NA,0))
TBI$unfav <- ifelse(TBI$d.gos<4,1,ifelse(TBI$d.gos==6,NA,0))
TBI$good <- ifelse(TBI$d.gos<5,1,ifelse(TBI$d.gos==6,NA,0))
TBI
$ctclass2 <- ifelse(TBI$ctclass==2,1,0)
TBI$ctclass1 <- ifelse(TBI$ctclass==1,1,0)
TBI$ctclass34 <- ifelse(TBI$ctclass==3 | TBI$ctclass==4,1,0)
TBI$ctclass56 <- ifelse(TBI$ctclass==5 | TBI$ctclass==6,1,0)
TBI$ctclassr4 <- ifelse(TBI$ctclass1==1,2,
TBIifelse(TBI$ctclass2==1,1,
ifelse(TBI$ctclass34==1,3,
ifelse(TBI$ctclass56==1,4,NA))))
$ctclassr4 <- as.factor(TBI$ctclassr4)
TBI$ctclassr3 <- ifelse(TBI$ctclass1==1 | TBI$ctclass2==1,1,
TBIifelse(TBI$ctclass34==1,2,
ifelse(TBI$ctclass56==1,3,NA)))
$ctclassr3 <- as.factor(TBI$ctclassr3)
TBI
<- TBI[,Cs(trial,
TBI.Analysis
age,
hypoxia,
hypotens,
cisterns,
shift,
tsah,
edh,
pupil,
motorG,
ctclassr3,
d.sysbpt,
hbt,
glucoset,
unfav,
mort)]
names(TBI.Analysis) <- Cs(trial,
age,
hypoxia,
hypotens,
cisterns,
shift,
tsah,
edh,
pupil,
motor,
ctclass,
d.sysbp,
hb,
glucose,
unfav,
mort )$cisterns <- ifelse(TBI.Analysis$cisterns>1,1,0)
TBI.Analysis
$trial <- as.factor(TBI.Analysis$trial)
TBI.Analysis$age <- as.numeric(TBI.Analysis$age)
TBI.Analysis$hypoxia <-as.factor(TBI.Analysis$hypoxia)
TBI.Analysis$hypotens <-as.factor(TBI.Analysis$hypotens)
TBI.Analysis$cisterns <-as.factor(TBI.Analysis$cisterns)
TBI.Analysis$shift <-as.factor(TBI.Analysis$shift)
TBI.Analysis$tsah <-as.factor(TBI.Analysis$tsah)
TBI.Analysis$edh <-as.factor(TBI.Analysis$edh)
TBI.Analysis$pupil <-as.factor(TBI.Analysis$pupil)
TBI.Analysis$motor <-as.factor(TBI.Analysis$motor)
TBI.Analysis$ctclass <-as.factor(TBI.Analysis$ctclass)
TBI.Analysis$d.sysbp <-as.numeric(TBI.Analysis$d.sysbp)
TBI.Analysis$hb <-as.numeric(TBI.Analysis$hb)
TBI.Analysis$glucose <-as.numeric(TBI.Analysis$glucose)
TBI.Analysis$unfav <-as.factor(TBI.Analysis$unfav)
TBI.Analysis$mort <-as.factor(TBI.Analysis$mort)
TBI.Analysis
dim(TBI.Analysis)
## [1] 2159 16
describe(TBI.Analysis)
## TBI.Analysis
##
## 16 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct
## 1910 249 2
##
## Value 0 1
## Frequency 1496 414
## Proportion 0.783 0.217
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct
## 2100 59 2
##
## Value 0 1
## Frequency 1723 377
## Proportion 0.82 0.18
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct
## 1902 257 2
##
## Value 0 1
## Frequency 1223 679
## Proportion 0.643 0.357
## --------------------------------------------------------------------------------
## shift
## n missing distinct
## 1907 252 2
##
## Value 0 1
## Frequency 1528 379
## Proportion 0.801 0.199
## --------------------------------------------------------------------------------
## tsah
## n missing distinct
## 2063 96 2
##
## Value 0 1
## Frequency 1077 986
## Proportion 0.522 0.478
## --------------------------------------------------------------------------------
## edh
## n missing distinct
## 2121 38 2
##
## Value 0 1
## Frequency 1852 269
## Proportion 0.873 0.127
## --------------------------------------------------------------------------------
## pupil
## n missing distinct
## 2036 123 3
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
## --------------------------------------------------------------------------------
## motor
## n missing distinct
## 2159 0 5
##
## Value 1 2 3 4 5
## Frequency 870 627 369 279 14
## Proportion 0.403 0.290 0.171 0.129 0.006
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct
## 2134 25 3
##
## Value 1 2 3
## Frequency 930 499 705
## Proportion 0.436 0.234 0.330
## --------------------------------------------------------------------------------
## d.sysbp
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60 64.19 65.6 71.33 73.71 , highest: 182.92 184.16 184.7 188.29 207.4
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6 6.1 6.2 6.3 6.4 , highest: 16.6 16.7 16.8 16.9 17
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3 3.03 3.05556 3.2 3.44444
## highest: 19.5 19.5556 19.7778 19.8 20
## --------------------------------------------------------------------------------
## unfav
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1308 851
## Proportion 0.606 0.394
## --------------------------------------------------------------------------------
## mort
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1656 503
## Proportion 0.767 0.233
## --------------------------------------------------------------------------------
summary(TBI.Analysis)
## trial age hypoxia hypotens cisterns shift
## 74:1118 Min. :14.00 0 :1496 0 :1723 0 :1223 0 :1528
## 75:1041 1st Qu.:22.00 1 : 414 1 : 377 1 : 679 1 : 379
## Median :30.00 NA's: 249 NA's: 59 NA's: 257 NA's: 252
## Mean :33.21
## 3rd Qu.:43.00
## Max. :79.00
##
## tsah edh pupil motor ctclass d.sysbp
## 0 :1077 0 :1852 1 :1430 1:870 1 :930 Min. : 60.0
## 1 : 986 1 : 269 2 : 279 2:627 2 :499 1st Qu.:121.1
## NA's: 96 NA's: 38 3 : 327 3:369 3 :705 Median :131.3
## NA's: 123 4:279 NA's: 25 Mean :131.4
## 5: 14 3rd Qu.:142.1
## Max. :207.4
##
## hb glucose unfav mort
## Min. : 6.00 Min. : 3.000 0:1308 0:1656
## 1st Qu.:10.90 1st Qu.: 6.700 1: 851 1: 503
## Median :12.90 Median : 8.200
## Mean :12.47 Mean : 8.863
## 3rd Qu.:14.30 3rd Qu.:10.400
## Max. :17.00 Max. :20.000
## NA's :24 NA's :64
##################################
# Formulating a data type assessment summary
##################################
<- TBI.Analysis
PDA <- data.frame(
(PDA.Summary Column.Index=c(1:length(names(PDA))),
Column.Name= names(PDA),
Column.Type=sapply(PDA, function(x) class(x)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 trial factor
## 2 2 age numeric
## 3 3 hypoxia factor
## 4 4 hypotens factor
## 5 5 cisterns factor
## 6 6 shift factor
## 7 7 tsah factor
## 8 8 edh factor
## 9 9 pupil factor
## 10 10 motor factor
## 11 11 ctclass factor
## 12 12 d.sysbp numeric
## 13 13 hb numeric
## 14 14 glucose numeric
## 15 15 unfav factor
## 16 16 mort factor
##################################
# Loading dataset
##################################
<- TBI.Analysis
DQA
##################################
# Listing all predictors
##################################
<- DQA[,names(DQA)]
DQA.Predictors
##################################
# Formulating an overall data quality assessment summary
##################################
<- data.frame(
(DQA.Summary Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 trial factor 2159 0 1.000
## 2 2 age numeric 2159 0 1.000
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 10 10 motor factor 2159 0 1.000
## 11 11 ctclass factor 2159 25 0.988
## 12 12 d.sysbp numeric 2159 0 1.000
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
## 15 15 unfav factor 2159 0 1.000
## 16 16 mort factor 2159 0 1.000
##################################
# Listing all numeric predictors
##################################
<- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
DQA.Predictors.Numeric
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
length(names(DQA.Predictors.Numeric))),
(" numeric predictor variable(s)."))
else {
} print("There are no numeric predictor variables.")
}
## [1] "There are 4 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
<- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
DQA.Predictors.Factor
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
length(names(DQA.Predictors.Factor))),
(" factor predictor variable(s)."))
else {
} print("There are no factor predictor variables.")
}
## [1] "There are 12 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
<- function(x) {
FirstModes <- unique(na.omit(x))
ux <- tabulate(match(x, ux))
tab == max(tab)]
ux[tab
}
##################################
# Formulating a function to determine the second mode
##################################
<- function(x) {
SecondModes <- unique(na.omit(x))
ux <- tabulate(match(x, ux))
tab = ux[tab == max(tab)]
fm = x[!(x %in% fm)]
sm <- unique(sm)
usm <- tabulate(match(sm, usm))
tabsm == max(tabsm)]
usm[tabsm
}
<- data.frame(
(DQA.Predictors.Factor.Summary Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 trial factor 2 74 75
## 2 hypoxia factor 3 0 1
## 3 hypotens factor 3 0 1
## 4 cisterns factor 3 0 1
## 5 shift factor 3 0 1
## 6 tsah factor 3 0 1
## 7 edh factor 3 0 1
## 8 pupil factor 4 1 3
## 9 motor factor 5 1 2
## 10 ctclass factor 4 1 3
## 11 unfav factor 2 0 1
## 12 mort factor 2 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio
## 1 1118 1041 0.001
## 2 1496 414 0.001
## 3 1723 377 0.001
## 4 1223 679 0.001
## 5 1528 379 0.001
## 6 1077 986 0.001
## 7 1852 269 0.001
## 8 1430 327 0.002
## 9 870 627 0.002
## 10 930 705 0.002
## 11 1308 851 0.001
## 12 1656 503 0.001
## First.Second.Mode.Ratio
## 1 1.074
## 2 3.614
## 3 4.570
## 4 1.801
## 5 4.032
## 6 1.092
## 7 6.885
## 8 4.373
## 9 1.388
## 10 1.319
## 11 1.537
## 12 3.292
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
<- function(x) {
FirstModes <- unique(na.omit(x))
ux <- tabulate(match(x, ux))
tab == max(tab)]
ux[tab
}
##################################
# Formulating a function to determine the second mode
##################################
<- function(x) {
SecondModes <- unique(na.omit(x))
ux <- tabulate(match(x, ux))
tab = ux[tab == max(tab)]
fm = na.omit(x)[!(na.omit(x) %in% fm)]
sm <- unique(sm)
usm <- tabulate(match(sm, usm))
tabsm == max(tabsm)]
usm[tabsm
}
<- data.frame(
(DQA.Predictors.Numeric.Summary Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(moments::skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(moments::kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 age numeric 64 0.030 21.000
## 2 d.sysbp numeric 1566 0.725 130.000
## 3 hb numeric 131 0.061 13.900
## 4 glucose numeric 382 0.177 20.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 19.000 104 100 1.040
## 2 120.000 9 8 1.125
## 3 13.600 47 43 1.093
## 4 6.500 29 27 1.074
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 14.000 33.211 30.000 79.000 0.721 2.585 22.000
## 2 60.000 131.430 131.280 207.400 -0.022 3.982 121.085
## 3 6.000 12.468 12.900 17.000 -0.619 2.786 10.900
## 4 3.000 8.863 8.200 20.000 1.192 4.727 6.700
## Percentile75th
## 1 43.000
## 2 142.060
## 3 14.300
## 4 10.400
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
(" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
$NA.Count>0,]
DQA.Summary[DQA.Summaryelse {
} print("No missing observations noted.")
}
## [1] "Missing observations noted for 10 variable(s) with NA.Count>0 and Fill.Rate<1.0."
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 11 11 ctclass factor 2159 25 0.988
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
} print(paste0("Low variance observed for ",
nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
(" factor variable(s) with First.Second.Mode.Ratio>5."))
as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
DQA.Predictors.Factor.Summary[else {
} print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 1 factor variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 7 edh factor 3 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio First.Second.Mode.Ratio
## 7 1852 269 0.001 6.885
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
} print(paste0("Low variance observed for ",
nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
(" numeric variable(s) with First.Second.Mode.Ratio>5."))
as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
DQA.Predictors.Numeric.Summary[else {
} print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
} print(paste0("Low variance observed for ",
nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
(" numeric variable(s) with Unique.Count.Ratio<0.01."))
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
DQA.Predictors.Numeric.Summary[else {
} print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
} as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
(as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
else {
} print("No skewed numeric predictors noted.")
}
## [1] "No skewed numeric predictors noted."
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDPA
##################################
# Reconstructing the dataset by :
# removing variables with fill rate > 0.50
# removing observations with NA ratio > 0.80
# converting factor variables to numeric variables
##################################
<- missCompare::clean(MDPA,
MDPA.Cleaned var_removal_threshold=0.50,
ind_removal_threshold=0.80)
##################################
# Gathering the metadata using the reconstructed dataset
##################################
<- missCompare::get_data(MDPA.Cleaned,
MDPA.Cleaned.Metadata matrixplot_sort = T,
plot_transform = T)
##################################
# Gathering the number of complete cases
##################################
$Complete_cases MDPA.Cleaned.Metadata
## [1] 1431
$Rows MDPA.Cleaned.Metadata
## [1] 2159
$Columns MDPA.Cleaned.Metadata
## [1] 16
$Complete_cases/MDPA.Cleaned.Metadata$Rows MDPA.Cleaned.Metadata
## [1] 0.6628069
##################################
# Gathering the number of incomplete cases
##################################
$Total_NA MDPA.Cleaned.Metadata
## [1] 1187
$NA_per_variable MDPA.Cleaned.Metadata
## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
$Fraction_missingness MDPA.Cleaned.Metadata
## [1] 0.03436197
$Fraction_missingness_per_variable MDPA.Cleaned.Metadata
## trial age hypoxia hypotens cisterns shift tsah
## 0.00000000 0.00000000 0.11533117 0.02732747 0.11903659 0.11672070 0.04446503
## edh pupil motor ctclass d.sysbp hb glucose
## 0.01760074 0.05697082 0.00000000 0.01157943 0.00000000 0.01111626 0.02964335
## unfav mort
## 0.00000000 0.00000000
##################################
# Gathering the missing data patterns
##################################
$Matrix_plot MDPA.Cleaned.Metadata
$Cluster_plot MDPA.Cleaned.Metadata
$Corr_matrix MDPA.Cleaned.Metadata
## trial age hypoxia hypotens cisterns
## trial 1.000000000 -0.030651416 1.656750e-01 0.1050912318 0.0090874012
## age -0.030651416 1.000000000 1.031147e-03 0.0797683643 0.0513284240
## hypoxia 0.165675039 0.001031147 1.000000e+00 0.2323622569 0.0460174467
## hypotens 0.105091232 0.079768364 2.323623e-01 1.0000000000 0.0004308286
## cisterns 0.009087401 0.051328424 4.601745e-02 0.0004308286 1.0000000000
## shift -0.065358546 0.104812735 4.487131e-20 -0.0549581210 0.4585598379
## tsah -0.097721950 0.174014741 1.627862e-02 -0.0075673746 0.0329649255
## edh -0.119425686 0.002670862 -4.682238e-02 -0.0770647036 0.0237954129
## pupil 0.088381902 0.016622292 1.438070e-01 0.1137020246 0.1464712963
## motor -0.051699973 -0.008694701 1.885604e-01 0.0656417934 0.1168144834
## ctclass -0.015770576 0.153012007 1.850571e-02 -0.0194182715 0.2788307306
## d.sysbp 0.087643656 0.012554957 -1.094494e-01 -0.2634324171 -0.0114882127
## hb 0.227845969 -0.031832004 -5.386741e-02 -0.2705358063 -0.0417687110
## glucose 0.090165100 0.069076332 1.619970e-01 0.1994708499 0.0498702817
## unfav -0.029068950 0.208375143 1.742552e-01 0.1776370570 0.1396093897
## mort -0.038439923 0.153665440 1.414791e-01 0.1666659119 0.1424181948
## shift tsah edh pupil motor
## trial -6.535855e-02 -0.097721950 -0.119425686 0.08838190 -0.051699973
## age 1.048127e-01 0.174014741 0.002670862 0.01662229 -0.008694701
## hypoxia 4.487131e-20 0.016278621 -0.046822378 0.14380701 0.188560393
## hypotens -5.495812e-02 -0.007567375 -0.077064704 0.11370202 0.065641793
## cisterns 4.585598e-01 0.032964926 0.023795413 0.14647130 0.116814483
## shift 1.000000e+00 0.090218463 0.041278708 0.09772835 0.088389352
## tsah 9.021846e-02 1.000000000 -0.010786454 0.14788861 0.116778650
## edh 4.127871e-02 -0.010786454 1.000000000 -0.02558442 -0.046758976
## pupil 9.772835e-02 0.147888607 -0.025584420 1.00000000 0.381055618
## motor 8.838935e-02 0.116778650 -0.046758976 0.38105562 1.000000000
## ctclass 2.793684e-01 0.191225179 0.296787667 0.21786366 0.167952008
## d.sysbp 9.615297e-03 -0.022296366 -0.015739545 -0.08779228 -0.123171876
## hb -3.378804e-02 0.030632398 -0.019653199 -0.08586573 -0.064731488
## glucose 2.573764e-02 0.102781495 -0.001029329 0.18136723 0.128402858
## unfav 1.340827e-01 0.249226529 -0.065700956 0.31905926 0.353979268
## mort 1.612055e-01 0.229655718 -0.061375715 0.25297999 0.267972519
## ctclass d.sysbp hb glucose unfav
## trial -0.015770576 0.087643656 0.227845969 0.090165100 -0.02906895
## age 0.153012007 0.012554957 -0.031832004 0.069076332 0.20837514
## hypoxia 0.018505712 -0.109449414 -0.053867408 0.161996976 0.17425522
## hypotens -0.019418271 -0.263432417 -0.270535806 0.199470850 0.17763706
## cisterns 0.278830731 -0.011488213 -0.041768711 0.049870282 0.13960939
## shift 0.279368362 0.009615297 -0.033788043 0.025737640 0.13408269
## tsah 0.191225179 -0.022296366 0.030632398 0.102781495 0.24922653
## edh 0.296787667 -0.015739545 -0.019653199 -0.001029329 -0.06570096
## pupil 0.217863665 -0.087792278 -0.085865732 0.181367225 0.31905926
## motor 0.167952008 -0.123171876 -0.064731488 0.128402858 0.35397927
## ctclass 1.000000000 -0.015114873 -0.000441974 0.123201094 0.24639544
## d.sysbp -0.015114873 1.000000000 0.195736194 -0.125362570 -0.09119451
## hb -0.000441974 0.195736194 1.000000000 -0.134613174 -0.15508193
## glucose 0.123201094 -0.125362570 -0.134613174 1.000000000 0.22878822
## unfav 0.246395438 -0.091194506 -0.155081925 0.228788217 1.00000000
## mort 0.222675184 -0.178138387 -0.162844394 0.229159186 0.68327089
## mort
## trial -0.03843992
## age 0.15366544
## hypoxia 0.14147910
## hypotens 0.16666591
## cisterns 0.14241819
## shift 0.16120554
## tsah 0.22965572
## edh -0.06137571
## pupil 0.25297999
## motor 0.26797252
## ctclass 0.22267518
## d.sysbp -0.17813839
## hb -0.16284439
## glucose 0.22915919
## unfav 0.68327089
## mort 1.00000000
$MD_Pattern MDPA.Cleaned.Metadata
## trial age hypoxia hypotens cisterns shift tsah edh pupil motor ctclass
## 1431 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1
## 200 1 1 1 1 0 0 1 1 1 1 1
## 157 1 1 0 1 1 1 1 1 1 1 1
## 12 1 1 0 1 0 0 1 1 1 1 1
## 82 1 1 1 1 1 1 1 1 0 1 1
## 12 1 1 1 1 0 0 1 1 0 1 1
## 10 1 1 0 1 1 1 1 1 0 1 1
## 1 1 1 0 1 0 1 1 1 0 1 1
## 42 1 1 1 1 1 1 0 1 1 1 1
## 9 1 1 1 1 0 0 0 1 1 1 1
## 11 1 1 0 1 1 1 0 1 1 1 1
## 5 1 1 1 1 1 1 0 1 0 1 1
## 34 1 1 1 1 1 1 1 1 1 1 1
## 8 1 1 0 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 0 1 1 1 1
## 10 1 1 1 0 1 1 1 1 1 1 1
## 27 1 1 0 0 1 1 1 1 1 1 1
## 2 1 1 0 0 0 0 1 1 1 1 1
## 3 1 1 0 0 1 1 1 1 0 1 1
## 2 1 1 1 0 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 0 1 1 1 1
## 3 1 1 1 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 0 1 1 1 1
## 7 1 1 1 1 1 1 1 0 1 1 1
## 3 1 1 1 1 0 0 1 0 1 1 1
## 3 1 1 0 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 1 0 0 1 1
## 1 1 1 0 1 1 1 1 0 0 1 1
## 10 1 1 1 1 1 1 0 0 1 1 1
## 4 1 1 1 1 0 0 0 0 1 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 1 1 1 0 0 0 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 0 1 1 1 0 1 1 1
## 1 1 1 0 0 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 0 0 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 0
## 2 1 1 1 1 0 0 1 1 1 1 0
## 4 1 1 0 1 1 1 1 1 1 1 0
## 3 1 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 0 1 1 1 1 1 1 0
## 1 1 1 1 0 0 0 1 1 1 1 0
## 1 1 1 1 1 1 1 1 0 1 1 0
## 2 1 1 1 1 1 1 0 0 1 1 0
## 5 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1
## 7 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 0 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 1 0 1 1 0 1 1 1 1
## 0 0 249 59 257 252 96 38 123 0 25
## d.sysbp hb glucose unfav mort
## 1431 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 200 1 1 1 1 1
## 157 1 1 1 1 1
## 12 1 1 1 1 1
## 82 1 1 1 1 1
## 12 1 1 1 1 1
## 10 1 1 1 1 1
## 1 1 1 1 1 1
## 42 1 1 1 1 1
## 9 1 1 1 1 1
## 11 1 1 1 1 1
## 5 1 1 1 1 1
## 34 1 1 0 1 1
## 8 1 1 0 1 1
## 2 1 1 0 1 1
## 2 1 1 0 1 1
## 10 1 1 1 1 1
## 27 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 0 1 1
## 1 1 1 0 1 1
## 1 1 1 0 1 1
## 7 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 10 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 0 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 11 1 1 1 1 1
## 2 1 1 1 1 1
## 4 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 0 1 1 1
## 4 1 0 1 1 1
## 1 1 0 1 1 1
## 7 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 2 1 0 1 1 1
## 1 1 0 0 1 1
## 0 24 64 0 0
dim(MDPA.Cleaned.Metadata$MD_Pattern)
## [1] 59 16
$NA_Correlations MDPA.Cleaned.Metadata
## trial age hypoxia hypotens
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.003075342 -1.991261e-02 NaN -0.050871031
## hypotens_is.na -0.003139022 9.723747e-03 -0.070715627 NaN
## cisterns_is.na -0.020274316 5.403308e-03 -0.036463860 0.021305289
## shift_is.na -0.012972969 2.893106e-03 -0.040060700 0.018044005
## tsah_is.na -0.048171238 1.924787e-02 -0.067614722 -0.098680649
## edh_is.na 0.023420299 5.196470e-03 -0.043045790 -0.065097712
## pupil_is.na 0.037218347 -4.342998e-02 -0.030756342 -0.018479920
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na -0.008194874 -8.522036e-03 -0.005461126 -0.046146806
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.039147773 2.245985e-05 -0.029832037 -0.002868117
## glucose_is.na 0.092140222 1.110012e-03 -0.005690478 0.003122214
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## cisterns shift tsah edh
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.005132467 -0.037162241 -8.090368e-03 0.0209094526
## hypotens_is.na 0.019420226 -0.014558767 -9.092611e-04 0.0009040846
## cisterns_is.na NaN 0.027979664 5.895326e-02 -0.0364386527
## shift_is.na -0.030781323 NaN 4.920710e-02 -0.0351378312
## tsah_is.na 0.035616071 -0.016129896 NaN 0.0201143139
## edh_is.na 0.017911716 0.012064215 3.759542e-02 NaN
## pupil_is.na 0.032634364 -0.017689375 -2.970642e-02 0.0320063995
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na 0.008762896 0.004581525 -6.719949e-05 0.0390176196
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.029175385 0.007846099 4.863460e-03 -0.0128075625
## glucose_is.na -0.042431688 -0.027251502 -1.925087e-02 -0.0167735791
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## pupil motor ctclass d.sysbp hb
## trial_is.na NaN NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN NaN
## hypoxia_is.na -0.0383020501 -0.077961939 -0.018323675 0.06582073 0.02059243
## hypotens_is.na -0.0252221782 -0.029960226 -0.013411691 0.02261856 -0.02056744
## cisterns_is.na 0.0268112302 0.111893054 -0.006296335 -0.06654302 0.11436064
## shift_is.na 0.0192540664 0.105806117 -0.017223821 -0.06403836 0.12099795
## tsah_is.na -0.0308960246 -0.041118410 0.002862034 0.07287370 0.06924227
## edh_is.na -0.0298496078 -0.023869160 0.018298843 0.04278551 0.06563305
## pupil_is.na NaN -0.028658751 -0.020254575 0.02318343 0.03887609
## motor_is.na NaN NaN NaN NaN NaN
## ctclass_is.na -0.0750312122 -0.023662963 NaN 0.01626310 0.06052439
## d.sysbp_is.na NaN NaN NaN NaN NaN
## hb_is.na 0.0005132771 -0.003710246 -0.012940904 0.02107206 NaN
## glucose_is.na 0.0150930949 -0.020565988 -0.049791976 0.04466416 0.01725871
## unfav_is.na NaN NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN NaN
## glucose unfav mort
## trial_is.na NaN NaN NaN
## age_is.na NaN NaN NaN
## hypoxia_is.na -0.004860708 -0.002531656 -0.02054161
## hypotens_is.na 0.011223556 0.013113682 0.01173212
## cisterns_is.na -0.046590315 0.033074703 0.03341347
## shift_is.na -0.045520597 0.024587698 0.02631154
## tsah_is.na -0.088678630 -0.055916708 -0.04589547
## edh_is.na -0.036218129 -0.021781268 -0.02622100
## pupil_is.na 0.010353762 -0.018474498 -0.02998735
## motor_is.na NaN NaN NaN
## ctclass_is.na -0.074101567 -0.063310212 -0.07348964
## d.sysbp_is.na NaN NaN NaN
## hb_is.na -0.008761202 -0.022962768 -0.03562051
## glucose_is.na NaN -0.015499980 -0.01349823
## unfav_is.na NaN NaN NaN
## mort_is.na NaN NaN NaN
$NA_Correlation_plot MDPA.Cleaned.Metadata
$min_PDM_thresholds MDPA.Cleaned.Metadata
## min_PDM_threshold perc_obs_retained
## 1 5 89
## 2 10 81
## 3 20 74
## 4 50 60
## 5 100 49
## 6 200 0
## 7 500 0
## 8 1000 0
$Vars_above_half MDPA.Cleaned.Metadata
## character(0)
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDPA
##################################
# Testing the missingness mechanism using a regression-based approach
# MCAR (Missing Completely at Random) versus MAR (Missing at Random)
# for variables with missing data as evaluated against variables with complete observations
##################################
RBtest(TBI.Analysis)
## $abs.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
##
## $rel.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0.00 0.00 0.12 0.03 0.12 0.12 0.04 0.02
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0.06 0.00 0.01 0.00 0.01 0.03 0.00 0.00
##
## $type
## trial age hypoxia hypotens cisterns shift tsah edh
## -1 -1 1 0 1 0 0 0
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0 -1 1 -1 0 0 -1 -1
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDIE
##################################
# Reconstructing the dataset by :
# removing variables with fill rate > 0.50
# removing observations with NA ratio > 0.80
# converting factor variables to numeric variables
##################################
<- missCompare::clean(MDIE,
MDIE.Cleaned var_removal_threshold=0.50,
ind_removal_threshold=0.80)
##################################
# Gathering the metadata using the reconstructed dataset
##################################
<- missCompare::get_data(MDIE.Cleaned,
MDIE.Cleaned.Metadata matrixplot_sort = T,
plot_transform = T)
##################################
# Simulating a dataset using the metadata characteristics
# applying MCAR, MAR and MNAR mechanisms
##################################
<- missCompare::simulate(rownum=MDIE.Cleaned.Metadata$Rows,
MDIE.Simulated colnum=MDIE.Cleaned.Metadata$Columns,
cormat=MDIE.Cleaned.Metadata$Corr_matrix,
meanval=0,
sdval=1)
##################################
# Performing 16 imputation methods
# on the simulated dataset
# Method 1 : random replacement
# Method 2 : median imputation
# Method 3 : mean imputation
# Method 4 : missMDA Regularized
# Method 5 : missMDA EM
# Method 6 : pcaMethods PPCA
# Method 7 : pcaMethods svdImpute
# Method 8 : pcaMethods BPCA
# Method 9 : pcaMethods NIPALS
# Method 10 : pcaMethods NLPCA
# Method 11 : mice mixed
# Method 12 : mi Bayesian
# Method 13 : Amelia II
# Method 14 : missForest
# Method 15 : Hmisc aregImpute
# Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Minimum number of observations per distinct missing data pattern = 10
# Number of iterations = 1 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::impute_simulated(rownum=MDIE.Cleaned.Metadata$Rows,
MDIE.Simulated.Imputed colnum=MDIE.Cleaned.Metadata$Columns,
cormat=MDIE.Cleaned.Metadata$Corr_matrix,
MD_pattern=MDIE.Cleaned.Metadata$MD_Pattern,
NA_fraction=MDIE.Cleaned.Metadata$Fraction_missingness,
min_PDM=10,
n.iter=1,
assumed_pattern=NA)
## [1] "ITERATION 1 OF TOTAL 1 - IN PROGRESS"
## [1] "random replacement imputation - in progress"
## [1] "Median imputation - in progress"
## [1] "Mean imputation - in progress"
## [1] "missMDA regularized imputation - in progress"
## [1] "missMDA EM imputation - in progress"
## [1] "pcaMethods PPCA imputation - in progress"
## [1] "pcaMethods svdImpute imputation - in progress"
## [1] "pcaMethods BPCA imputation - in progress"
## [1] "pcaMethods NIPALS imputation - in progress"
## [1] "pcaMethods NLPCA imputation - in progress"
## [1] "mice mixed imputation - in progress"
## [1] "mi imputation - in progress"
## [1] "Amelia II imputation - in progress"
## [1] "missForest imputation - in progress"
## [1] "Hmisc aregImpute imputation - in progress"
## [1] "VIM kNN imputation - in progress"
##################################
# Comparing the performance of the 16 imputation methods
# in terms of the process execution time
# per missing data mechanism
##################################
$Plot_TIME MDIE.Simulated.Imputed
##################################
# Comparing the performance of the 16 imputation methods
# in terms of the root mean square error
# per missing data mechanism
##################################
$Plot_RMSE MDIE.Simulated.Imputed
##################################
# Comparing the performance of the 16 imputation methods
# in terms of the mean absolute error
# per missing data mechanism
##################################
$Plot_MAE MDIE.Simulated.Imputed
##################################
# Comparing the performance of the 16 imputation methods
# in terms of the kolmogorov smirnov statistic
# per missing data mechanism
##################################
$Plot_KS MDIE.Simulated.Imputed
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDIPID
##################################
# Performing missing data imputation
# on the original dataset using
# Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::impute_data(MDIPID,
MDIPID.Imputed.MiceMixed scale=F,
n.iter=3,
sel_method=c(11))
## [1] "mice mixed imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.MiceMixed.Diagnostics $mice_mixed_imputation[[1]],
MDIPID.Imputed.MiceMixedscale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
$Histograms MDIPID.Imputed.MiceMixed.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
$Boxplots MDIPID.Imputed.MiceMixed.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
$Statistics MDIPID.Imputed.MiceMixed.Diagnostics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.46761124 2.50332935 13.32916667 1.77040383 0.02686154
## KS_test.D
## 0.19814598
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.86326120 3.19075643 8.98048611 3.16849310 0.77157338
## KS_test.D
## 0.06303699
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
$Correlation_plot MDIPID.Imputed.MiceMixed.Diagnostics
##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
$Barcharts MDIPID.Imputed.MiceMixed.Diagnostics
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 11 : mice mixed
###############################
$Variable_clusters_imp MDIPID.Imputed.MiceMixed.Diagnostics
$Variable_clusters_orig MDIPID.Imputed.MiceMixed.Diagnostics
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDIPID
##################################
# Performing missing data imputation
# on the original dataset using
# Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::impute_data(MDIPID,
MDIPID.Imputed.HmiscAregImpute scale=F,
n.iter=3,
sel_method=c(15))
## [1] "Hmisc aregImpute imputation - in progress"
##################################
# Performing post-imputation diagnostics
# Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.HmiscAregImpute.Diagnostics $Hmisc_aregImpute_imputation[[1]],
MDIPID.Imputed.HmiscAregImputescale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
$Histograms MDIPID.Imputed.HmiscAregImpute.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
$Boxplots MDIPID.Imputed.HmiscAregImpute.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
$Statistics MDIPID.Imputed.HmiscAregImpute.Diagnostics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.4676112 2.5033293 12.8291667 2.1772498 0.4273850
## KS_test.D
## 0.1156909
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.86326120 3.19075643 8.76217014 3.27316496 0.80831262
## KS_test.D
## 0.06581891
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
$Correlation_plot MDIPID.Imputed.HmiscAregImpute.Diagnostics
##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
$Barcharts MDIPID.Imputed.HmiscAregImpute.Diagnostics
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
###############################
$Variable_clusters_imp MDIPID.Imputed.HmiscAregImpute.Diagnostics
$Variable_clusters_orig MDIPID.Imputed.HmiscAregImpute.Diagnostics
##################################
# Loading dataset
##################################
<- TBI.Analysis
MDIPID
##################################
# Performing missing data imputation
# on the original dataset using
# Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::impute_data(MDIPID,
MDIPID.Imputed.VIMkNN scale=F,
n.iter=3,
sel_method=c(16))
## [1] "VIM kNN imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
<- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.VIMkNN.Diagnostics $VIM_kNN_imputation[[1]],
MDIPID.Imputed.VIMkNNscale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
$Histograms MDIPID.Imputed.VIMkNN.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
$Boxplots MDIPID.Imputed.VIMkNN.Diagnostics
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
$Statistics MDIPID.Imputed.VIMkNN.Diagnostics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.46761124 2.50332935 12.98958333 1.22669396 0.05223604
## KS_test.D
## 0.22775176
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.8632612039 3.1907564349 8.1860677083 1.2654165421 0.0001742062
## KS_test.D
## 0.2199806086
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 12 : VIM kNN
##################################
$Correlation_plot MDIPID.Imputed.VIMkNN.Diagnostics
##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
$Barcharts MDIPID.Imputed.VIMkNN.Diagnostics
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
###############################
$Variable_clusters_imp MDIPID.Imputed.VIMkNN.Diagnostics
$Variable_clusters_orig MDIPID.Imputed.VIMkNN.Diagnostics