Supervised Learning: Identifying Contributing Factors for Countries With High Cancer Rates Using Classification Algorithms With Class Imbalance Treatment
- 1. Table of Contents
- 1.1 Introduction
- 1.2 Methodology
- 1.3 Results
- 1.3.1 Data Preparation
- 1.3.2 Data Quality Assessment
- 1.3.3 Data Preprocessing
- 1.3.4 Data Exploration
- 1.3.5 Model Development With Hyperparameter Tuning
- 1.3.6 Model Development With Class Weights
- 1.3.7 Model Development With SMOTE Upsampling
- 1.3.8 Model Development With CNN Downsampling
- 1.3.9 Model Development With Stacking Ensemble Learning
- 1.3.10 Model Selection
- 1.3.11 Model Presentation
- 2. Summary
- 3. References
1. Table of Contents
1.1 Introduction
Age-standardized cancer rates are measures used to compare cancer incidence between countries while accounting for differences in age distribution. They allow for a more accurate assessment of the relative risk of cancer across populations with diverse demographic and socio-economic characteristics - enabling a more nuanced understanding of the global burden of cancer and facilitating evidence-based public health interventions.
Datasets used for the analysis were separately gathered and consolidated from various sources including:
- Cancer Rates from World Population Review
- Social Protection and Labor Indicator from World Bank
- Education Indicator from World Bank
- Economy and Growth Indicator from World Bank
- Environment Indicator from World Bank
- Climate Change Indicator from World Bank
- Agricultural and Rural Development Indicator from World Bank
- Social Development Indicator from World Bank
- Health Indicator from World Bank
- Science and Technology Indicator from World Bank
- Urban Development Indicator from World Bank
- Human Development Indices from Human Development Reports
- Environmental Performance Indices from Yale Center for Environmental Law and Policy
This study hypothesized that various global development indicators and indices influence cancer rates across countries.
Subsequent analysis and modelling steps involving data understanding, data preparation, data exploration, model development, model validation and model presentation are individually detailed below, with all the results consolidated in a Summary at the end of the document.
1.1.1 Study Objectives
The main objective of the study is to develop an interpretable classification model that provides robust and reliable predictions of membership in the group of countries with high cancer rates from an optimal set of observations and predictors, while addressing class imbalance and delivering accurate predictions when applied to new, unseen data.
Specific objectives are given as follows:
- Obtain an optimal subset of observations and predictors by conducting data quality assessment and feature selection, excluding cases or variables noted with irregularities and applying the preprocessing operations most suitable for the downstream analysis
- Develop multiple classification models with remedial measures applied to address class imbalance and with hyperparameters optimized through internal resampling validation
- Select the final classification model among the candidates based on robust performance estimates
- Evaluate the final model's performance and generalization ability through external validation on an independent set
- Conduct a post-hoc exploration of the model results to provide general insights on the importance, contribution and effect of the various predictors on model prediction
1.1.2 Outcome
The analysis endpoint for the study is described below:
- CANRAT (categorical): Age-standardized cancer rate per 100K population (2022), dichotomized into two categories: High for countries in the upper 25% of rates and Low for countries in the lower 75%; a minimal sketch of this dichotomization is given below
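The raw rates are not retained in the consolidated dataset, but a minimal sketch of how such a percentile-based dichotomization could be derived, assuming a hypothetical raw_rates series, is given below:
##################################
# Sketch: dichotomizing hypothetical raw
# age-standardized rates at the 75th percentile
##################################
import numpy as np
import pandas as pd

raw_rates = pd.Series([452.4, 422.9, 373.7, 362.2, 351.1, 242.3, 115.6, 98.2])  # hypothetical values
threshold = raw_rates.quantile(0.75)
canrat = pd.Categorical(np.where(raw_rates > threshold, 'High', 'Low'),
                        categories=['Low', 'High'], ordered=True)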
1.1.3 Predictors
Detailed descriptions for each individual predictor used in the study are provided as follows:
- GDPPER (numeric): GDP per person employed, current US Dollars (2020)
- URBPOP (numeric): Urban population, % of total population (2020)
- PATRES (numeric): Patent applications by residents, total count (2020)
- RNDGDP (numeric): Research and development expenditure, % of GDP (2020)
- POPGRO (numeric): Population growth, annual % (2020)
- LIFEXP (numeric): Life expectancy at birth, total in years (2020)
- TUBINC (numeric): Incidence of tuberculosis, per 100K population (2020)
- DTHCMD (numeric): Cause of death by communicable diseases and maternal, prenatal and nutrition conditions, % of total (2019)
- AGRLND (numeric): Agricultural land, % of land area (2020)
- GHGEMI (numeric): Total greenhouse gas emissions, kt of CO2 equivalent (2020)
- RELOUT (numeric): Renewable electricity output, % of total electricity output (2015)
- METEMI (numeric): Methane emissions, kt of CO2 equivalent (2020)
- FORARE (numeric): Forest area, % of land area (2020)
- CO2EMI (numeric): CO2 emissions, metric tons per capita (2020)
- PM2EXP (numeric): PM2.5 air pollution, population exposed to levels exceeding WHO guideline value, % of total (2017)
- POPDEN (numeric): Population density, people per sq. km of land area (2020)
- GDPCAP (numeric): GDP per capita, current US Dollars (2020)
- ENRTER (numeric): Tertiary school enrollment, % gross (2020)
- HDICAT (categorical): Human development index, ordered category (2020)
- EPISCO (numeric): Environmental performance index, score (2022)
1.2 Methodology
1.2.1 Data Assessment
Preliminary data used in the study was evaluated and prepared for analysis and modelling using the following methods:
Data Quality Assessment involves profiling and assessing the data to understand its suitability for machine learning tasks. The quality of training data has a huge impact on the efficiency, accuracy and complexity of machine learning tasks. Data remains susceptible to errors or irregularities that may be introduced during the collection, aggregation or annotation stages. Issues such as incorrect labels, synonymous categories in a categorical variable or heterogeneity in columns, among others, might go undetected by standard pre-processing modules in machine learning frameworks and can lead to sub-optimal model performance, inaccurate analysis and unreliable decisions.
Data Preprocessing involves changing the raw feature vectors into a representation that is more suitable for the downstream modelling and estimation processes, including data cleaning, integration, reduction and transformation. Data cleaning aims to identify and correct errors in the dataset that may negatively impact a predictive model such as removing outliers, replacing missing values, smoothing noisy data, and correcting inconsistent data. Data integration addresses potential issues with redundant and inconsistent data obtained from multiple sources through approaches such as detection of tuple duplication and data conflict. The purpose of data reduction is to have a condensed representation of the data set that is smaller in volume, while maintaining the integrity of the original data set. Data transformation converts the data into the most appropriate form for data modeling.
Data Exploration involves analyzing and investigating data sets to summarize their main characteristics, often employing data visualization methods. It helps determine how best to manipulate data sources to discover patterns, spot anomalies, test a hypothesis, or check assumptions. This process is primarily used to see what data can reveal beyond the formal modeling or hypothesis testing task and provides a better understanding of data set variables and the relationships between them.
Iterative Imputer is based on the Multivariate Imputation by Chained Equations (MICE) algorithm - an imputation method based on fully conditional specification, where each incomplete variable is imputed by a separate model. As a sequential regression imputation technique, the algorithm imputes an incomplete column (target column) by generating plausible synthetic values given the other columns in the data. Each incomplete column must act as a target column and has its own specific set of predictors. For predictors that are incomplete themselves, the most recently generated imputations are used to complete the predictors prior to imputation of the target columns. The Linear Regression model was formulated for imputation - it explores the linear relationship between a scalar response and one or more covariates by having the conditional mean of the dependent variable be an affine function of the independent variables. The relationship is modeled through a disturbance term which represents an unobserved random variable that adds noise. The model is typically fitted to the data using the least squares method, which estimates the coefficients by minimizing the squared residual function. The linear equation assigns one scale factor, represented by a coefficient, to each covariate and an additional coefficient called the intercept or bias coefficient, which gives the line an additional degree of freedom, allowing it to move up and down a two-dimensional plot.
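A minimal sketch of this imputation setup, using scikit-learn's IterativeImputer with a LinearRegression estimator on a hypothetical numeric frame (the actual study applies it to the retained numeric predictors):
##################################
# Sketch: MICE-style sequential regression
# imputation with a linear estimator
##################################
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # enables the experimental imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Hypothetical numeric predictors with missing entries
numeric_data = pd.DataFrame({'GDPCAP': [51722.1, np.nan, 85420.2, 63528.6],
                             'LIFEXP': [83.2, 82.3, np.nan, 77.0],
                             'CO2EMI': [14.8, 6.2, 6.8, np.nan]})
imputer = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=88)
numeric_imputed = pd.DataFrame(imputer.fit_transform(numeric_data),
                               columns=numeric_data.columns)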
Yeo-Johnson Transformation applies a family of power transformations that can be used without restrictions, extending many of the good properties of the Box-Cox power family. Similar to the Box-Cox transformation, the method estimates the optimal value of lambda but has the ability to transform both positive and negative values by inflating low-variance data and deflating high-variance data to create a more uniform data set. While there are no restrictions in terms of the applicable values, the interpretability of the transformed values is diminished compared to the other methods.
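A minimal sketch of the transformation using scikit-learn's PowerTransformer, which estimates the optimal lambda per column via maximum likelihood:
##################################
# Sketch: Yeo-Johnson transformation
# with per-column lambda estimation
##################################
import numpy as np
from sklearn.preprocessing import PowerTransformer

X = np.array([[1.2], [-0.5], [3.4], [10.0], [0.0], [250.0]])  # hypothetical skewed values
yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
X_transformed = yeo_johnson.fit_transform(X)
print(yeo_johnson.lambdas_)  # estimated lambda for each column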
1.2.2 Feature Selection
Statistical test measures were assessed for the numeric and categorical predictors in the study to determine the optimal subset of variables for the subsequent modelling process, which included the following:
Pearson's Correlation Coefficient is a parametric measure of the linear correlation for a pair of features, calculated as the ratio between their covariance and the product of their standard deviations. A high absolute correlation value indicates a univariate association between the numeric predictors and the numeric response.
Two-Sample T-Test Statistic is used to determine whether there is a significant difference between the means of two independent groups. It is calculated as the difference between the means of the two groups divided by the standard error of the difference. The test statistic follows a t-distribution with degrees of freedom calculated based on the sample sizes and assumptions about the variances.
Chi-square Test Statistic is used to assess whether there is a significant association between two categorical variables. It is calculated by comparing the observed frequencies of the contingency table with the frequencies that would be expected if the variables were independent. The test statistic follows a chi-square distribution, and the degrees of freedom are determined by the number of categories in the variables being analyzed.
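Minimal sketches of these three screening tests, applied to hypothetical arrays with scipy.stats, are given below:
##################################
# Sketch: univariate screening tests
# on hypothetical arrays
##################################
import numpy as np
from scipy import stats

# Pearson's correlation between two numeric variables
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])
pearson_r, pearson_p = stats.pearsonr(x, y)

# Two-sample t-test comparing a numeric predictor between two outcome groups
group_low = np.array([5.1, 4.8, 5.5, 5.0])
group_high = np.array([7.2, 6.9, 7.8, 7.1])
t_stat, t_p = stats.ttest_ind(group_low, group_high)

# Chi-square test of independence on an observed contingency table
contingency = np.array([[30, 10], [15, 25]])
chi2_stat, chi2_p, dof, expected = stats.chi2_contingency(contingency)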
1.2.3 Model Formulation
Machine Learning Classification Models are algorithms that learn to assign predefined categories or labels to input data based on patterns and relationships identified during the training phase. Classification is a supervised learning task, meaning the models are trained on a labeled dataset where the correct output (class or label) is known for each input. Once trained, these models can predict the class of new, unseen instances.
This study implemented both glass-box and black-box classification modelling procedures, with structures ranging from simple to complex and involving moderate to large numbers of model coefficients or mathematical transformations, the latter lacking transparency in terms of the internal processes and weighted factors used in reaching a decision. The models applied in the analysis for predicting the categorical target were the following:
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by having the log-odds of the event be a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood estimation by testing different values through multiple iterations to optimize for the best fit of log odds. All of these iterations produce the log likelihood function, and logistic regression seeks to maximize this function to find the best parameter estimates. Given the optimal parameters, the conditional probabilities for each observation can be calculated, logged, and summed together to yield a predicted probability.
Decision Trees create a model that predicts the class label of a sample based on input features. A decision tree consists of nodes that represent decisions or choices, edges which connect nodes and represent the possible outcomes of a decision, and leaf (or terminal) nodes which represent the final decision or the predicted class label. The decision-making process involves:
- Feature selection: at each internal node, the algorithm decides which feature to split on based on a criterion such as Gini impurity or entropy.
- Splitting criteria: the splitting criteria aim to find the feature and corresponding threshold that best separate the data into different classes, with the goal of increasing homogeneity within each resulting subset.
- Recursive splitting: the process of feature selection and splitting continues recursively, creating a tree structure; the dataset is partitioned at each internal node based on the chosen feature, and the process repeats for each subset.
- Stopping criteria: the recursion stops when a certain condition, known as a stopping criterion, is met; common stopping criteria include a maximum depth for the tree, a minimum number of samples required to split a node, or a minimum number of samples in a leaf node.
Random Forest is an ensemble learning method made up of a large set of small decision trees called estimators, with each producing its own prediction. The random forest model aggregates the predictions of the estimators to produce a more accurate prediction. The algorithm involves bootstrap aggregating (where smaller subsets of the training data are repeatedly subsampled with replacement), random subspacing (where a subset of features are sampled and used to train each individual estimator), estimator training (where unpruned decision trees are formulated for each estimator) and inference by aggregating the predictions of all estimators.
Support Vector Machine plots each observation in an N-dimensional space corresponding to the number of features in the data set and finds a hyperplane that separates the different classes with a maximally large margin (defined as the distance between the hyperplane and the closest data points from each class). The algorithm applies a kernel transformation by mapping non-linearly separable data, using the similarities between the points in a high-dimensional feature space, for improved discrimination.
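A minimal sketch instantiating baseline versions of the four models with scikit-learn (the hyperparameter values shown are illustrative placeholders, not the tuned settings):
##################################
# Sketch: baseline instantiation
# of the four classification models
##################################
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {'logistic_regression': LogisticRegression(max_iter=1000, random_state=88),
          'decision_tree': DecisionTreeClassifier(max_depth=3, random_state=88),
          'random_forest': RandomForestClassifier(n_estimators=100, random_state=88),
          'support_vector_machine': SVC(kernel='rbf', probability=True, random_state=88)}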
Different versions of the individual models were formulated by applying remedial measures to address class imbalance, described as follows:
Hyperparameter Tuning is an iterative process that involves experimenting with different hyperparameter combinations, evaluating the model's performance, and refining the hyperparameter values to achieve the best possible performance on new, unseen data - aimed at building effective and well-generalizing machine learning models. A model's performance depends not only on the learned parameters (weights) during training but also on hyperparameters, which are external configuration settings that cannot be learned from the data.
Class Weights are used to assign different levels of importance to different classes when the distribution of instances across different classes in a classification problem is not equal. By assigning higher weights to the minority class, the model is encouraged to give more attention to correctly predicting instances from the minority class. Class weights are incorporated into the loss function during training. The loss for each instance is multiplied by its corresponding class weight. This means that misclassifying an instance from the minority class will have a greater impact on the overall loss than misclassifying an instance from the majority class. The use of class weights helps balance the influence of each class during training, mitigating the impact of class imbalance. It provides a way to focus the learning process on the classes that are underrepresented in the training data.
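A minimal sketch of incorporating class weights, assuming scikit-learn's class_weight='balanced' heuristic which reweights each class inversely to its frequency:
##################################
# Sketch: class weighting inversely
# proportional to class frequencies
##################################
from sklearn.linear_model import LogisticRegression

# 'balanced' assigns each class the weight n_samples / (n_classes * class_count),
# so misclassifying the minority High class incurs a larger loss
weighted_logistic_regression = LogisticRegression(class_weight='balanced',
                                                  max_iter=1000,
                                                  random_state=88)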
Synthetic Minority Oversampling Technique is specifically designed to increase the representation of the minority class by generating new minority instances between existing instances. The new instances created are not just copies of existing minority cases; instead, for each minority class instance, the algorithm generates synthetic examples by creating linear combinations of the feature vectors between that instance and its k nearest neighbors. The synthetic samples are placed along the line segments connecting the original instance to its neighbors.
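A minimal sketch of SMOTE upsampling with imbalanced-learn, applied to a hypothetical imbalanced training partition (in practice, resampling is applied to the training data only):
##################################
# Sketch: SMOTE upsampling
# of the minority class
##################################
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Hypothetical imbalanced training partition (75:25 class ratio)
X_train, y_train = make_classification(n_samples=100, weights=[0.75], random_state=88)
smote = SMOTE(k_neighbors=5, random_state=88)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)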
Condensed Nearest Neighbors is a prototype selection algorithm that aims to select a subset of instances from the original dataset, discarding redundant and less informative instances. The algorithm works by iteratively adding instances to the subset, starting with an empty set. At each iteration, an instance is added if it is not correctly classified by the current subset. The decision to add or discard an instance is based on its performance on a k-nearest neighbors classifier. If an instance is misclassified by the current subset's k-nearest neighbors, it is added to the subset. The process is repeated until no new instances are added to the subset. The resulting subset is a condensed representation of the dataset that retains the essential information needed for classification.
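A minimal sketch of Condensed Nearest Neighbors downsampling with imbalanced-learn on the same kind of hypothetical training partition:
##################################
# Sketch: Condensed Nearest Neighbors
# downsampling of the majority class
##################################
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.datasets import make_classification

# Hypothetical imbalanced training partition (75:25 class ratio)
X_train, y_train = make_classification(n_samples=100, weights=[0.75], random_state=88)
cnn = CondensedNearestNeighbour(n_neighbors=1, random_state=88)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train)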
An additional iteration of the modelling process applying an ensemble structure was carried out for comparison:
Model Stacking, also known as stacked generalization, is an ensemble approach which involves creating a variety of base learners and using them to create intermediate predictions, one for each learned model. A meta-model is incorporated that learns the same target from the intermediate predictions. Unlike bagging, in stacking the models are typically different (e.g. not all decision trees) and fit on the same dataset (e.g. instead of samples of the training dataset). Unlike boosting, in stacking a single model is used to learn how to best combine the predictions from the contributing models (e.g. instead of a sequence of models that correct the predictions of prior models). Stacking is appropriate when the predictions made by the base learners, or the errors in those predictions, have minimal correlation. Achieving an improvement in performance is dependent upon the choice of base learners and whether they are sufficiently skillful in their predictions.
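A minimal sketch of a stacking ensemble with scikit-learn's StackingClassifier, using illustrative base learners and a logistic regression meta-model:
##################################
# Sketch: stacking ensemble with a
# logistic regression meta-model
##################################
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

base_learners = [('decision_tree', DecisionTreeClassifier(random_state=88)),
                 ('random_forest', RandomForestClassifier(random_state=88)),
                 ('support_vector_machine', SVC(probability=True, random_state=88))]
stacked_model = StackingClassifier(estimators=base_learners,
                                   final_estimator=LogisticRegression(max_iter=1000),
                                   cv=5,
                                   stack_method='predict_proba')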
1.2.4 Model Hyperparameter Tuning
The optimal combination of hyperparameter values, which maximized the performance of the various classification models in the study, was determined using the following hyperparameter tuning strategy:
K-Fold Cross-Validation involves dividing the training set, after a random shuffle, into a user-defined K number of smaller non-overlapping sets called folds. Each unique fold is assigned as the hold-out test data to assess the model trained on the data collected from all the remaining K-1 folds. The evaluation score is retained but the model is discarded. The process is repeated, resulting in a total of K fitted models evaluated on the K hold-out test sets. All K computed performance measures reported from the process are then averaged to represent the estimated performance of the model. This approach can be computationally expensive and may be highly dependent on how the data was randomly assigned to the respective folds, but it does not waste too much data, which is a major advantage in problems where the number of samples is very small.
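A minimal sketch combining K-fold cross-validation with an exhaustive grid search via scikit-learn's GridSearchCV (the parameter grid and data are illustrative placeholders):
##################################
# Sketch: hyperparameter tuning with
# 5-fold cross-validated grid search
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hypothetical training partition
X_train, y_train = make_classification(n_samples=100, weights=[0.75], random_state=88)
hyperparameter_grid = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7]}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=88),
                           param_grid=hyperparameter_grid,
                           cv=5,
                           scoring='f1')
grid_search.fit(X_train, y_train)  # evaluates each combination across the 5 folds
print(grid_search.best_params_)    # best configuration, refit on the full training set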
1.2.5 Model Performance Evaluation
The predictive performance of the formulated classification models in the study was compared and evaluated using the following metrics:
Accuracy is the ratio of correctly predicted instances to the total instances. It provides an overall measure of model performance which is easy to understand and interpret, but can be misleading in imbalanced datasets when one class dominates.
Precision is the ratio of correctly predicted positive observations to the total predicted positives. It is useful when the cost of false positives is high but does not consider false negatives, so might not be suitable for imbalanced datasets.
Recall is the ratio of correctly predicted positive observations to all the actual positives. It is useful when the cost of false negatives is high but does not consider false positives, so might not be suitable for imbalanced datasets.
F1 Score is the harmonic mean of precision and recall. It balances precision and recall, providing a single metric for performance evaluation which is suitable for imbalanced datasets, although it might not be the best metric in situations where precision or recall is more critical.
AUROC measures the area under the receiver operating characteristic curve, which illustrates the trade-off between the true positive rate (sensitivity) and the false positive rate at various classification thresholds. It provides a comprehensive evaluation of the model's ability to discriminate between classes and is robust to imbalanced datasets. Compared to other metrics, however, it is less directly interpretable and is not sensitive to changes in class distribution.
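A minimal sketch computing the five metrics with scikit-learn on hypothetical predictions:
##################################
# Sketch: classification metrics
# on hypothetical predictions
##################################
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)

y_true = [0, 0, 0, 1, 1, 0, 1, 0]                   # hypothetical labels (1 = High)
y_pred = [0, 0, 1, 1, 0, 0, 1, 0]                   # hypothetical class predictions
y_prob = [0.1, 0.3, 0.6, 0.8, 0.4, 0.2, 0.9, 0.3]   # hypothetical class probabilities

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auroc = roc_auc_score(y_true, y_prob)  # AUROC is computed from probabilities, not hard labels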
1.2.6 Model Presentation
Model presentation was conducted post-hoc and focused on both model-specific and model-agnostic techniques, the latter making no assumptions about the model structure. These methods are described as follows:
Odds Ratios aid in interpreting the relationship between the independent variables and the probability of an event occurring in a logistic regression model by quantifying the change in odds associated with a one-unit change in an independent variable. An estimated value greater than one indicates that the odds of the event are expected to increase by a factor equal to the odds ratio for a one-unit increase in the independent variable, while an estimated value less than one indicates that the odds of the event are expected to decrease by the reciprocal of the odds ratio for a one-unit increase in the independent variable.
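A minimal sketch of extracting odds ratios by exponentiating the coefficients of a fitted logistic regression on hypothetical data:
##################################
# Sketch: odds ratios from exponentiated
# logistic regression coefficients
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=4, random_state=88)
logistic_regression = LogisticRegression(max_iter=1000).fit(X, y)
odds_ratios = np.exp(logistic_regression.coef_[0])  # multiplicative change in odds per one-unit increase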
Shapley Additive Explanations are based on Shapley values developed in cooperative game theory. The process involves explaining a prediction by assuming that each explanatory variable for an instance is a player in a game where the prediction is the payout. The game is the prediction task for a single instance of the data set. The gain is the actual prediction for this instance minus the average prediction for all instances. The players are the explanatory variable values of the instance that collaborate to receive the gain (predict a certain value). The determined value is the average marginal contribution of an explanatory variable across all possible coalitions.
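A minimal sketch computing SHAP values for a fitted tree ensemble on hypothetical data, using the shap library's TreeExplainer:
##################################
# Sketch: SHAP value estimation
# for a fitted tree ensemble
##################################
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=88)
random_forest = RandomForestClassifier(random_state=88).fit(X, y)
explainer = shap.TreeExplainer(random_forest)  # model-specific explainer for tree ensembles
shap_values = explainer.shap_values(X)         # per-instance, per-feature contributions
# shap.summary_plot(shap_values, X) would visualize global feature importance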
1.3 Results
1.3.1 Data Preparation
- The initial tabular dataset comprised 177 observations and 22 variables (including 1 metadata, 1 target and 20 predictors).
- 177 rows (observations)
- 22 columns (variables)
- 1/22 metadata (object)
- COUNTRY
- 1/22 target (categorical)
- CANRAT
- 19/22 predictor (numeric)
- GDPPER
- URBPOP
- PATRES
- RNDGDP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- RELOUT
- METEMI
- FORARE
- CO2EMI
- PM2EXP
- POPDEN
- GDPCAP
- ENRTER
- EPISCO
- 1/22 predictor (categorical)
- HDICAT
##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import os
%matplotlib inline
from operator import add,mul,truediv
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour
from scipy import stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
import shap
##################################
# Filtering out unnecessary warnings
##################################
import warnings
warnings.filterwarnings('ignore')
##################################
# Defining file paths
##################################
DATASETS_ORIGINAL_PATH = r"datasets\original"
##################################
# Loading the dataset
# from the DATASETS_ORIGINAL_PATH
##################################
cancer_rate = pd.read_csv(os.path.join("..", DATASETS_ORIGINAL_PATH, "CategoricalCancerRates.csv"))
##################################
# Performing a general exploration of the dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate.shape)
Dataset Dimensions:
(177, 22)
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate.dtypes)
Column Names and Data Types:
COUNTRY     object
CANRAT      object
GDPPER     float64
URBPOP     float64
PATRES     float64
RNDGDP     float64
POPGRO     float64
LIFEXP     float64
TUBINC     float64
DTHCMD     float64
AGRLND     float64
GHGEMI     float64
RELOUT     float64
METEMI     float64
FORARE     float64
CO2EMI     float64
PM2EXP     float64
POPDEN     float64
ENRTER     float64
GDPCAP     float64
HDICAT      object
EPISCO     float64
dtype: object
##################################
# Taking a snapshot of the dataset
##################################
cancer_rate.head()
COUNTRY | CANRAT | GDPPER | URBPOP | PATRES | RNDGDP | POPGRO | LIFEXP | TUBINC | DTHCMD | ... | RELOUT | METEMI | FORARE | CO2EMI | PM2EXP | POPDEN | ENRTER | GDPCAP | HDICAT | EPISCO | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Australia | High | 98380.63601 | 86.241 | 2368.0 | NaN | 1.235701 | 83.200000 | 7.2 | 4.941054 | ... | 13.637841 | 131484.763200 | 17.421315 | 14.772658 | 24.893584 | 3.335312 | 110.139221 | 51722.06900 | VH | 60.1 |
1 | New Zealand | High | 77541.76438 | 86.699 | 348.0 | NaN | 2.204789 | 82.256098 | 7.2 | 4.354730 | ... | 80.081439 | 32241.937000 | 37.570126 | 6.160799 | NaN | 19.331586 | 75.734833 | 41760.59478 | VH | 56.7 |
2 | Ireland | High | 198405.87500 | 63.653 | 75.0 | 1.23244 | 1.029111 | 82.556098 | 5.3 | 5.684596 | ... | 27.965408 | 15252.824630 | 11.351720 | 6.768228 | 0.274092 | 72.367281 | 74.680313 | 85420.19086 | VH | 57.4 |
3 | United States | High | 130941.63690 | 82.664 | 269586.0 | 3.42287 | 0.964348 | 76.980488 | 2.3 | 5.302060 | ... | 13.228593 | 748241.402900 | 33.866926 | 13.032828 | 3.343170 | 36.240985 | 87.567657 | 63528.63430 | VH | 51.1 |
4 | Denmark | High | 113300.60110 | 88.116 | 1261.0 | 2.96873 | 0.291641 | 81.602439 | 4.1 | 6.826140 | ... | 65.505925 | 7778.773921 | 15.711000 | 4.691237 | 56.914456 | 145.785100 | 82.664330 | 60915.42440 | VH | 77.9 |
5 rows × 22 columns
##################################
# Setting the levels of the categorical variables
##################################
cancer_rate['CANRAT'] = cancer_rate['CANRAT'].astype('category')
cancer_rate['CANRAT'] = cancer_rate['CANRAT'].cat.set_categories(['Low', 'High'], ordered=True)
cancer_rate['HDICAT'] = cancer_rate['HDICAT'].astype('category')
cancer_rate['HDICAT'] = cancer_rate['HDICAT'].cat.set_categories(['L', 'M', 'H', 'VH'], ordered=True)
##################################
# Performing a general exploration of the numeric variables
##################################
print('Numeric Variable Summary:')
display(cancer_rate.describe(include='number').transpose())
Numeric Variable Summary:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
GDPPER | 165.0 | 45284.424283 | 3.941794e+04 | 1718.804896 | 13545.254510 | 34024.900890 | 66778.416050 | 2.346469e+05 |
URBPOP | 174.0 | 59.788121 | 2.280640e+01 | 13.345000 | 42.432750 | 61.701500 | 79.186500 | 1.000000e+02 |
PATRES | 108.0 | 20607.388889 | 1.340683e+05 | 1.000000 | 35.250000 | 244.500000 | 1297.750000 | 1.344817e+06 |
RNDGDP | 74.0 | 1.197474 | 1.189956e+00 | 0.039770 | 0.256372 | 0.873660 | 1.608842 | 5.354510e+00 |
POPGRO | 174.0 | 1.127028 | 1.197718e+00 | -2.079337 | 0.236900 | 1.179959 | 2.031154 | 3.727101e+00 |
LIFEXP | 174.0 | 71.746113 | 7.606209e+00 | 52.777000 | 65.907500 | 72.464610 | 77.523500 | 8.456000e+01 |
TUBINC | 174.0 | 105.005862 | 1.367229e+02 | 0.770000 | 12.000000 | 44.500000 | 147.750000 | 5.920000e+02 |
DTHCMD | 170.0 | 21.260521 | 1.927333e+01 | 1.283611 | 6.078009 | 12.456279 | 36.980457 | 6.520789e+01 |
AGRLND | 174.0 | 38.793456 | 2.171551e+01 | 0.512821 | 20.130276 | 40.386649 | 54.013754 | 8.084112e+01 |
GHGEMI | 170.0 | 259582.709895 | 1.118550e+06 | 179.725150 | 12527.487367 | 41009.275980 | 116482.578575 | 1.294287e+07 |
RELOUT | 153.0 | 39.760036 | 3.191492e+01 | 0.000296 | 10.582691 | 32.381668 | 63.011450 | 1.000000e+02 |
METEMI | 170.0 | 47876.133575 | 1.346611e+05 | 11.596147 | 3662.884908 | 11118.976025 | 32368.909040 | 1.186285e+06 |
FORARE | 173.0 | 32.218177 | 2.312001e+01 | 0.008078 | 11.604388 | 31.509048 | 49.071780 | 9.741212e+01 |
CO2EMI | 170.0 | 3.751097 | 4.606479e+00 | 0.032585 | 0.631924 | 2.298368 | 4.823496 | 3.172684e+01 |
PM2EXP | 167.0 | 91.940595 | 2.206003e+01 | 0.274092 | 99.627134 | 100.000000 | 100.000000 | 1.000000e+02 |
POPDEN | 174.0 | 200.886765 | 6.453834e+02 | 2.115134 | 27.454539 | 77.983133 | 153.993650 | 7.918951e+03 |
ENRTER | 116.0 | 49.994997 | 2.970619e+01 | 2.432581 | 22.107195 | 53.392460 | 71.057467 | 1.433107e+02 |
GDPCAP | 170.0 | 13992.095610 | 1.957954e+04 | 216.827417 | 1870.503029 | 5348.192875 | 17421.116227 | 1.173705e+05 |
EPISCO | 165.0 | 42.946667 | 1.249086e+01 | 18.900000 | 33.000000 | 40.900000 | 50.500000 | 7.790000e+01 |
##################################
# Performing a general exploration of the object variable
##################################
print('Object Variable Summary:')
display(cancer_rate.describe(include='object').transpose())
Object Variable Summary:
count | unique | top | freq | |
---|---|---|---|---|
COUNTRY | 177 | 177 | Australia | 1 |
##################################
# Performing a general exploration of the categorical variables
##################################
print('Categorical Variable Summary:')
display(cancer_rate.describe(include='category').transpose())
Categorical Variable Summary:
count | unique | top | freq | |
---|---|---|---|---|
CANRAT | 177 | 2 | Low | 132 |
HDICAT | 167 | 4 | VH | 59 |
##################################
# Performing a general exploration of the categorical variable
##################################
cancer_rate.HDICAT.value_counts(normalize = True)
HDICAT
VH    0.353293
H     0.233533
M     0.221557
L     0.191617
Name: proportion, dtype: float64
##################################
# Performing a general exploration of the response variable
##################################
cancer_rate.CANRAT.value_counts(normalize = True)
CANRAT
Low     0.745763
High    0.254237
Name: proportion, dtype: float64
1.3.2 Data Quality Assessment
Data quality findings based on assessment are as follows:
- No duplicated rows observed.
- Missing data noted for 20 variables with Null.Count>0 and Fill.Rate<1.0.
- RNDGDP: Null.Count = 103, Fill.Rate = 0.418
- PATRES: Null.Count = 69, Fill.Rate = 0.610
- ENRTER: Null.Count = 61, Fill.Rate = 0.655
- RELOUT: Null.Count = 24, Fill.Rate = 0.864
- GDPPER: Null.Count = 12, Fill.Rate = 0.932
- EPISCO: Null.Count = 12, Fill.Rate = 0.932
- HDICAT: Null.Count = 10, Fill.Rate = 0.943
- PM2EXP: Null.Count = 10, Fill.Rate = 0.943
- DTHCMD: Null.Count = 7, Fill.Rate = 0.960
- METEMI: Null.Count = 7, Fill.Rate = 0.960
- CO2EMI: Null.Count = 7, Fill.Rate = 0.960
- GDPCAP: Null.Count = 7, Fill.Rate = 0.960
- GHGEMI: Null.Count = 7, Fill.Rate = 0.960
- FORARE: Null.Count = 4, Fill.Rate = 0.977
- TUBINC: Null.Count = 3, Fill.Rate = 0.983
- AGRLND: Null.Count = 3, Fill.Rate = 0.983
- POPGRO: Null.Count = 3, Fill.Rate = 0.983
- POPDEN: Null.Count = 3, Fill.Rate = 0.983
- URBPOP: Null.Count = 3, Fill.Rate = 0.983
- LIFEXP: Null.Count = 3, Fill.Rate = 0.983
- 120 observations noted with at least one missing value. Of these, 14 observations reported a high Missing.Rate>0.2.
- COUNTRY=Guadeloupe: Missing.Rate= 0.909
- COUNTRY=Martinique: Missing.Rate= 0.909
- COUNTRY=French Guiana: Missing.Rate= 0.909
- COUNTRY=New Caledonia: Missing.Rate= 0.500
- COUNTRY=French Polynesia: Missing.Rate= 0.500
- COUNTRY=Guam: Missing.Rate= 0.500
- COUNTRY=Puerto Rico: Missing.Rate= 0.409
- COUNTRY=North Korea: Missing.Rate= 0.273
- COUNTRY=Somalia: Missing.Rate= 0.273
- COUNTRY=South Sudan: Missing.Rate= 0.273
- COUNTRY=Venezuela: Missing.Rate= 0.227
- COUNTRY=Libya: Missing.Rate= 0.227
- COUNTRY=Eritrea: Missing.Rate= 0.227
- COUNTRY=Yemen: Missing.Rate= 0.227
- Low variance observed for 1 variable with First.Second.Mode.Ratio>5.
- PM2EXP: First.Second.Mode.Ratio = 53.000
- No low variance observed for any variable with Unique.Count.Ratio>10.
- High skewness observed for 5 variables with Skewness>3 or Skewness<(-3).
- POPDEN: Skewness = +10.267
- GHGEMI: Skewness = +9.496
- PATRES: Skewness = +9.284
- METEMI: Skewness = +5.801
- PM2EXP: Skewness = -3.141
##################################
# Counting the number of duplicated rows
##################################
cancer_rate.duplicated().sum()
np.int64(0)
##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate.dtypes)
##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate.columns)
##################################
# Gathering the number of observations for each column
##################################
row_count_list = list([len(cancer_rate)] * len(cancer_rate.columns))
##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate.isna().sum(axis=0))
##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate.count())
##################################
# Gathering the fill rate for each column
##################################
fill_rate_list = map(truediv, non_null_count_list, row_count_list)
##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
data_type_list,
row_count_list,
non_null_count_list,
null_count_list,
fill_rate_list),
columns=['Column.Name',
'Column.Type',
'Row.Count',
'Non.Null.Count',
'Null.Count',
'Fill.Rate'])
display(all_column_quality_summary)
Column.Name | Column.Type | Row.Count | Non.Null.Count | Null.Count | Fill.Rate | |
---|---|---|---|---|---|---|
0 | COUNTRY | object | 177 | 177 | 0 | 1.000000 |
1 | CANRAT | category | 177 | 177 | 0 | 1.000000 |
2 | GDPPER | float64 | 177 | 165 | 12 | 0.932203 |
3 | URBPOP | float64 | 177 | 174 | 3 | 0.983051 |
4 | PATRES | float64 | 177 | 108 | 69 | 0.610169 |
5 | RNDGDP | float64 | 177 | 74 | 103 | 0.418079 |
6 | POPGRO | float64 | 177 | 174 | 3 | 0.983051 |
7 | LIFEXP | float64 | 177 | 174 | 3 | 0.983051 |
8 | TUBINC | float64 | 177 | 174 | 3 | 0.983051 |
9 | DTHCMD | float64 | 177 | 170 | 7 | 0.960452 |
10 | AGRLND | float64 | 177 | 174 | 3 | 0.983051 |
11 | GHGEMI | float64 | 177 | 170 | 7 | 0.960452 |
12 | RELOUT | float64 | 177 | 153 | 24 | 0.864407 |
13 | METEMI | float64 | 177 | 170 | 7 | 0.960452 |
14 | FORARE | float64 | 177 | 173 | 4 | 0.977401 |
15 | CO2EMI | float64 | 177 | 170 | 7 | 0.960452 |
16 | PM2EXP | float64 | 177 | 167 | 10 | 0.943503 |
17 | POPDEN | float64 | 177 | 174 | 3 | 0.983051 |
18 | ENRTER | float64 | 177 | 116 | 61 | 0.655367 |
19 | GDPCAP | float64 | 177 | 170 | 7 | 0.960452 |
20 | HDICAT | category | 177 | 167 | 10 | 0.943503 |
21 | EPISCO | float64 | 177 | 165 | 12 | 0.932203 |
##################################
# Counting the number of columns
# with Fill.Rate < 1.00
##################################
len(all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1)])
20
##################################
# Identifying the columns
# with Fill.Rate < 1.00
##################################
display(all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1)].sort_values(by=['Fill.Rate'], ascending=True))
Column.Name | Column.Type | Row.Count | Non.Null.Count | Null.Count | Fill.Rate | |
---|---|---|---|---|---|---|
5 | RNDGDP | float64 | 177 | 74 | 103 | 0.418079 |
4 | PATRES | float64 | 177 | 108 | 69 | 0.610169 |
18 | ENRTER | float64 | 177 | 116 | 61 | 0.655367 |
12 | RELOUT | float64 | 177 | 153 | 24 | 0.864407 |
21 | EPISCO | float64 | 177 | 165 | 12 | 0.932203 |
2 | GDPPER | float64 | 177 | 165 | 12 | 0.932203 |
16 | PM2EXP | float64 | 177 | 167 | 10 | 0.943503 |
20 | HDICAT | category | 177 | 167 | 10 | 0.943503 |
15 | CO2EMI | float64 | 177 | 170 | 7 | 0.960452 |
13 | METEMI | float64 | 177 | 170 | 7 | 0.960452 |
11 | GHGEMI | float64 | 177 | 170 | 7 | 0.960452 |
9 | DTHCMD | float64 | 177 | 170 | 7 | 0.960452 |
19 | GDPCAP | float64 | 177 | 170 | 7 | 0.960452 |
14 | FORARE | float64 | 177 | 173 | 4 | 0.977401 |
6 | POPGRO | float64 | 177 | 174 | 3 | 0.983051 |
3 | URBPOP | float64 | 177 | 174 | 3 | 0.983051 |
17 | POPDEN | float64 | 177 | 174 | 3 | 0.983051 |
10 | AGRLND | float64 | 177 | 174 | 3 | 0.983051 |
7 | LIFEXP | float64 | 177 | 174 | 3 | 0.983051 |
8 | TUBINC | float64 | 177 | 174 | 3 | 0.983051 |
##################################
# Identifying the columns
# with Fill.Rate < 0.90
##################################
column_low_fill_rate = all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<0.90)]
##################################
# Gathering the metadata labels for each observation
##################################
row_metadata_list = cancer_rate["COUNTRY"].values.tolist()
##################################
# Gathering the number of columns for each observation
##################################
column_count_list = list([len(cancer_rate.columns)] * len(cancer_rate))
##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(cancer_rate.isna().sum(axis=1))
##################################
# Gathering the missing data percentage for each row
##################################
missing_rate_list = map(truediv, null_row_list, column_count_list)
##################################
# Identifying the rows
# with missing data
##################################
all_row_quality_summary = pd.DataFrame(zip(row_metadata_list,
column_count_list,
null_row_list,
missing_rate_list),
columns=['Row.Name',
'Column.Count',
'Null.Count',
'Missing.Rate'])
display(all_row_quality_summary)
Row.Name | Column.Count | Null.Count | Missing.Rate | |
---|---|---|---|---|
0 | Australia | 22 | 1 | 0.045455 |
1 | New Zealand | 22 | 2 | 0.090909 |
2 | Ireland | 22 | 0 | 0.000000 |
3 | United States | 22 | 0 | 0.000000 |
4 | Denmark | 22 | 0 | 0.000000 |
... | ... | ... | ... | ... |
172 | Congo Republic | 22 | 3 | 0.136364 |
173 | Bhutan | 22 | 2 | 0.090909 |
174 | Nepal | 22 | 2 | 0.090909 |
175 | Gambia | 22 | 4 | 0.181818 |
176 | Niger | 22 | 2 | 0.090909 |
177 rows × 4 columns
##################################
# Counting the number of rows
# with Missing.Rate > 0.00
##################################
len(all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.00)])
120
##################################
# Counting the number of rows
# with Missing.Rate > 0.20
##################################
len(all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.20)])
14
##################################
# Identifying the rows
# with Missing.Rate > 0.20
##################################
row_high_missing_rate = all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.20)]
##################################
# Displaying the rows
# with Missing.Rate > 0.20
##################################
display(all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.20)].sort_values(by=['Missing.Rate'], ascending=False))
Row.Name | Column.Count | Null.Count | Missing.Rate | |
---|---|---|---|---|
35 | Guadeloupe | 22 | 20 | 0.909091 |
39 | Martinique | 22 | 20 | 0.909091 |
56 | French Guiana | 22 | 20 | 0.909091 |
13 | New Caledonia | 22 | 11 | 0.500000 |
44 | French Polynesia | 22 | 11 | 0.500000 |
75 | Guam | 22 | 11 | 0.500000 |
53 | Puerto Rico | 22 | 9 | 0.409091 |
85 | North Korea | 22 | 6 | 0.272727 |
168 | South Sudan | 22 | 6 | 0.272727 |
132 | Somalia | 22 | 6 | 0.272727 |
117 | Libya | 22 | 5 | 0.227273 |
73 | Venezuela | 22 | 5 | 0.227273 |
161 | Eritrea | 22 | 5 | 0.227273 |
164 | Yemen | 22 | 5 | 0.227273 |
##################################
# Formulating the dataset
# with numeric columns only
##################################
cancer_rate_numeric = cancer_rate.select_dtypes(include='number')
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = cancer_rate_numeric.columns
##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = cancer_rate_numeric.min()
##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = cancer_rate_numeric.mean()
##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = cancer_rate_numeric.median()
##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = cancer_rate_numeric.max()
##################################
# Gathering the first mode values for each numeric column
##################################
numeric_first_mode_list = [cancer_rate[x].value_counts(dropna=True).index.tolist()[0] for x in cancer_rate_numeric]
##################################
# Gathering the second mode values for each numeric column
##################################
numeric_second_mode_list = [cancer_rate[x].value_counts(dropna=True).index.tolist()[1] for x in cancer_rate_numeric]
##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [cancer_rate_numeric[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in cancer_rate_numeric]
##################################
# Gathering the count of second mode values for each numeric column
##################################
numeric_second_mode_count_list = [cancer_rate_numeric[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in cancer_rate_numeric]
##################################
# Gathering the first mode to second mode ratio for each numeric column
##################################
numeric_first_second_mode_ratio_list = map(truediv, numeric_first_mode_count_list, numeric_second_mode_count_list)
##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = cancer_rate_numeric.nunique(dropna=True)
##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = list([len(cancer_rate_numeric)] * len(cancer_rate_numeric.columns))
##################################
# Gathering the unique to count ratio for each numeric column
##################################
numeric_unique_count_ratio_list = map(truediv, numeric_unique_count_list, numeric_row_count_list)
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_numeric.skew()
##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = cancer_rate_numeric.kurtosis()
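##################################
# Formulating the summary
# for all numeric columns
##################################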
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
numeric_minimum_list,
numeric_mean_list,
numeric_median_list,
numeric_maximum_list,
numeric_first_mode_list,
numeric_second_mode_list,
numeric_first_mode_count_list,
numeric_second_mode_count_list,
numeric_first_second_mode_ratio_list,
numeric_unique_count_list,
numeric_row_count_list,
numeric_unique_count_ratio_list,
numeric_skewness_list,
numeric_kurtosis_list),
columns=['Numeric.Column.Name',
'Minimum',
'Mean',
'Median',
'Maximum',
'First.Mode',
'Second.Mode',
'First.Mode.Count',
'Second.Mode.Count',
'First.Second.Mode.Ratio',
'Unique.Count',
'Row.Count',
'Unique.Count.Ratio',
'Skewness',
'Kurtosis'])
display(numeric_column_quality_summary)
Numeric.Column.Name | Minimum | Mean | Median | Maximum | First.Mode | Second.Mode | First.Mode.Count | Second.Mode.Count | First.Second.Mode.Ratio | Unique.Count | Row.Count | Unique.Count.Ratio | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GDPPER | 1718.804896 | 45284.424283 | 34024.900890 | 2.346469e+05 | 98380.636010 | 77541.764380 | 1 | 1 | 1.000000 | 165 | 177 | 0.932203 | 1.517574 | 3.471992 |
1 | URBPOP | 13.345000 | 59.788121 | 61.701500 | 1.000000e+02 | 100.000000 | 86.699000 | 2 | 1 | 2.000000 | 173 | 177 | 0.977401 | -0.210702 | -0.962847 |
2 | PATRES | 1.000000 | 20607.388889 | 244.500000 | 1.344817e+06 | 6.000000 | 2.000000 | 4 | 3 | 1.333333 | 97 | 177 | 0.548023 | 9.284436 | 91.187178 |
3 | RNDGDP | 0.039770 | 1.197474 | 0.873660 | 5.354510e+00 | 1.232440 | 3.422870 | 1 | 1 | 1.000000 | 74 | 177 | 0.418079 | 1.396742 | 1.695957 |
4 | POPGRO | -2.079337 | 1.127028 | 1.179959 | 3.727101e+00 | 1.235701 | 2.204789 | 1 | 1 | 1.000000 | 174 | 177 | 0.983051 | -0.195161 | -0.423580 |
5 | LIFEXP | 52.777000 | 71.746113 | 72.464610 | 8.456000e+01 | 83.200000 | 82.256098 | 1 | 1 | 1.000000 | 174 | 177 | 0.983051 | -0.357965 | -0.649601 |
6 | TUBINC | 0.770000 | 105.005862 | 44.500000 | 5.920000e+02 | 12.000000 | 4.100000 | 4 | 3 | 1.333333 | 131 | 177 | 0.740113 | 1.746333 | 2.429368 |
7 | DTHCMD | 1.283611 | 21.260521 | 12.456279 | 6.520789e+01 | 4.941054 | 4.354730 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 0.900509 | -0.691541 |
8 | AGRLND | 0.512821 | 38.793456 | 40.386649 | 8.084112e+01 | 46.252480 | 38.562911 | 1 | 1 | 1.000000 | 174 | 177 | 0.983051 | 0.074000 | -0.926249 |
9 | GHGEMI | 179.725150 | 259582.709895 | 41009.275980 | 1.294287e+07 | 571903.119900 | 80158.025830 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 9.496120 | 101.637308 |
10 | RELOUT | 0.000296 | 39.760036 | 32.381668 | 1.000000e+02 | 100.000000 | 80.081439 | 3 | 1 | 3.000000 | 151 | 177 | 0.853107 | 0.501088 | -0.981774 |
11 | METEMI | 11.596147 | 47876.133575 | 11118.976025 | 1.186285e+06 | 131484.763200 | 32241.937000 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 5.801014 | 38.661386 |
12 | FORARE | 0.008078 | 32.218177 | 31.509048 | 9.741212e+01 | 17.421315 | 37.570126 | 1 | 1 | 1.000000 | 173 | 177 | 0.977401 | 0.519277 | -0.322589 |
13 | CO2EMI | 0.032585 | 3.751097 | 2.298368 | 3.172684e+01 | 14.772658 | 6.160799 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 2.721552 | 10.311574 |
14 | PM2EXP | 0.274092 | 91.940595 | 100.000000 | 1.000000e+02 | 100.000000 | 100.000000 | 106 | 2 | 53.000000 | 61 | 177 | 0.344633 | -3.141557 | 9.032386 |
15 | POPDEN | 2.115134 | 200.886765 | 77.983133 | 7.918951e+03 | 3.335312 | 19.331586 | 1 | 1 | 1.000000 | 174 | 177 | 0.983051 | 10.267750 | 119.995256 |
16 | ENRTER | 2.432581 | 49.994997 | 53.392460 | 1.433107e+02 | 110.139221 | 75.734833 | 1 | 1 | 1.000000 | 116 | 177 | 0.655367 | 0.275863 | -0.392895 |
17 | GDPCAP | 216.827417 | 13992.095610 | 5348.192875 | 1.173705e+05 | 51722.069000 | 41760.594780 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 2.258568 | 5.938690 |
18 | EPISCO | 18.900000 | 42.946667 | 40.900000 | 7.790000e+01 | 29.600000 | 43.600000 | 3 | 3 | 1.000000 | 137 | 177 | 0.774011 | 0.641799 | 0.035208 |
##################################
# Counting the number of numeric columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['First.Second.Mode.Ratio']>5)])
1
##################################
# Identifying the numeric columns
# with First.Second.Mode.Ratio > 5.00
##################################
display(numeric_column_quality_summary[(numeric_column_quality_summary['First.Second.Mode.Ratio']>5)].sort_values(by=['First.Second.Mode.Ratio'], ascending=False))
Numeric.Column.Name | Minimum | Mean | Median | Maximum | First.Mode | Second.Mode | First.Mode.Count | Second.Mode.Count | First.Second.Mode.Ratio | Unique.Count | Row.Count | Unique.Count.Ratio | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14 | PM2EXP | 0.274092 | 91.940595 | 100.0 | 100.0 | 100.0 | 100.0 | 106 | 2 | 53.0 | 61 | 177 | 0.344633 | -3.141557 | 9.032386 |
##################################
# Counting the number of numeric columns
# with Unique.Count.Ratio > 10.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Unique.Count.Ratio']>10)])
0
##################################
# Counting the number of numeric columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Skewness']>3) | (numeric_column_quality_summary['Skewness']<(-3))])
5
##################################
# Identifying the numeric columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
display(numeric_column_quality_summary[(numeric_column_quality_summary['Skewness']>3) | (numeric_column_quality_summary['Skewness']<(-3))].sort_values(by=['Skewness'], ascending=False))
Numeric.Column.Name | Minimum | Mean | Median | Maximum | First.Mode | Second.Mode | First.Mode.Count | Second.Mode.Count | First.Second.Mode.Ratio | Unique.Count | Row.Count | Unique.Count.Ratio | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | POPDEN | 2.115134 | 200.886765 | 77.983133 | 7.918951e+03 | 3.335312 | 19.331586 | 1 | 1 | 1.000000 | 174 | 177 | 0.983051 | 10.267750 | 119.995256 |
9 | GHGEMI | 179.725150 | 259582.709895 | 41009.275980 | 1.294287e+07 | 571903.119900 | 80158.025830 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 9.496120 | 101.637308 |
2 | PATRES | 1.000000 | 20607.388889 | 244.500000 | 1.344817e+06 | 6.000000 | 2.000000 | 4 | 3 | 1.333333 | 97 | 177 | 0.548023 | 9.284436 | 91.187178 |
11 | METEMI | 11.596147 | 47876.133575 | 11118.976025 | 1.186285e+06 | 131484.763200 | 32241.937000 | 1 | 1 | 1.000000 | 170 | 177 | 0.960452 | 5.801014 | 38.661386 |
14 | PM2EXP | 0.274092 | 91.940595 | 100.000000 | 1.000000e+02 | 100.000000 | 100.000000 | 106 | 2 | 53.000000 | 61 | 177 | 0.344633 | -3.141557 | 9.032386 |
##################################
# Formulating the dataset
# with object column only
##################################
cancer_rate_object = cancer_rate.select_dtypes(include='object')
##################################
# Gathering the variable names for the object column
##################################
object_variable_name_list = cancer_rate_object.columns
##################################
# Gathering the first mode values for the object column
##################################
object_first_mode_list = [cancer_rate[x].value_counts().index.tolist()[0] for x in cancer_rate_object]
##################################
# Gathering the second mode values for each object column
##################################
object_second_mode_list = [cancer_rate[x].value_counts().index.tolist()[1] for x in cancer_rate_object]
##################################
# Gathering the count of first mode values for each object column
##################################
object_first_mode_count_list = [cancer_rate_object[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in cancer_rate_object]
##################################
# Gathering the count of second mode values for each object column
##################################
object_second_mode_count_list = [cancer_rate_object[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in cancer_rate_object]
##################################
# Gathering the first mode to second mode ratio for each object column
##################################
object_first_second_mode_ratio_list = map(truediv, object_first_mode_count_list, object_second_mode_count_list)
##################################
# Gathering the count of unique values for each object column
##################################
object_unique_count_list = cancer_rate_object.nunique(dropna=True)
##################################
# Gathering the number of observations for each object column
##################################
object_row_count_list = list([len(cancer_rate_object)] * len(cancer_rate_object.columns))
##################################
# Gathering the unique to count ratio for each object column
##################################
object_unique_count_ratio_list = map(truediv, object_unique_count_list, object_row_count_list)
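##################################
# Formulating the summary
# for the object column
##################################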
object_column_quality_summary = pd.DataFrame(zip(object_variable_name_list,
object_first_mode_list,
object_second_mode_list,
object_first_mode_count_list,
object_second_mode_count_list,
object_first_second_mode_ratio_list,
object_unique_count_list,
object_row_count_list,
object_unique_count_ratio_list),
columns=['Object.Column.Name',
'First.Mode',
'Second.Mode',
'First.Mode.Count',
'Second.Mode.Count',
'First.Second.Mode.Ratio',
'Unique.Count',
'Row.Count',
'Unique.Count.Ratio'])
display(object_column_quality_summary)
Object.Column.Name | First.Mode | Second.Mode | First.Mode.Count | Second.Mode.Count | First.Second.Mode.Ratio | Unique.Count | Row.Count | Unique.Count.Ratio | |
---|---|---|---|---|---|---|---|---|---|
0 | COUNTRY | Australia | New Zealand | 1 | 1 | 1.0 | 177 | 177 | 1.0 |
##################################
# Counting the number of object columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(object_column_quality_summary[(object_column_quality_summary['First.Second.Mode.Ratio']>5)])
0
##################################
# Counting the number of object columns
# with Unique.Count.Ratio > 10.00
##################################
len(object_column_quality_summary[(object_column_quality_summary['Unique.Count.Ratio']>10)])
0
##################################
# Formulating the dataset
# with categorical columns only
##################################
cancer_rate_categorical = cancer_rate.select_dtypes(include='category')
##################################
# Gathering the variable names for the categorical column
##################################
categorical_variable_name_list = cancer_rate_categorical.columns
##################################
# Gathering the first mode values for each categorical column
##################################
categorical_first_mode_list = [cancer_rate[x].value_counts().index.tolist()[0] for x in cancer_rate_categorical]
##################################
# Gathering the second mode values for each categorical column
##################################
categorical_second_mode_list = [cancer_rate[x].value_counts().index.tolist()[1] for x in cancer_rate_categorical]
##################################
# Gathering the count of first mode values for each categorical column
##################################
categorical_first_mode_count_list = [cancer_rate_categorical[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in cancer_rate_categorical]
##################################
# Gathering the count of second mode values for each categorical column
##################################
categorical_second_mode_count_list = [cancer_rate_categorical[x].isin([cancer_rate[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in cancer_rate_categorical]
##################################
# Gathering the first mode to second mode ratio for each categorical column
##################################
categorical_first_second_mode_ratio_list = map(truediv, categorical_first_mode_count_list, categorical_second_mode_count_list)
##################################
# Gathering the count of unique values for each categorical column
##################################
categorical_unique_count_list = cancer_rate_categorical.nunique(dropna=True)
##################################
# Gathering the number of observations for each categorical column
##################################
categorical_row_count_list = list([len(cancer_rate_categorical)] * len(cancer_rate_categorical.columns))
##################################
# Gathering the unique to count ratio for each categorical column
##################################
categorical_unique_count_ratio_list = map(truediv, categorical_unique_count_list, categorical_row_count_list)
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
categorical_first_mode_list,
categorical_second_mode_list,
categorical_first_mode_count_list,
categorical_second_mode_count_list,
categorical_first_second_mode_ratio_list,
categorical_unique_count_list,
categorical_row_count_list,
categorical_unique_count_ratio_list),
columns=['Categorical.Column.Name',
'First.Mode',
'Second.Mode',
'First.Mode.Count',
'Second.Mode.Count',
'First.Second.Mode.Ratio',
'Unique.Count',
'Row.Count',
'Unique.Count.Ratio'])
display(categorical_column_quality_summary)
| | Categorical.Column.Name | First.Mode | Second.Mode | First.Mode.Count | Second.Mode.Count | First.Second.Mode.Ratio | Unique.Count | Row.Count | Unique.Count.Ratio |
|---|---|---|---|---|---|---|---|---|---|
| 0 | CANRAT | Low | High | 132 | 45 | 2.933333 | 2 | 177 | 0.011299 |
| 1 | HDICAT | VH | H | 59 | 39 | 1.512821 | 4 | 177 | 0.022599 |
##################################
# Counting the number of categorical columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['First.Second.Mode.Ratio']>5)])
0
##################################
# Counting the number of categorical columns
# with Unique.Count.Ratio > 10.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['Unique.Count.Ratio']>10)])
0
1.3.3 Data Preprocessing ¶
1.3.3.1 Data Cleaning ¶
- Subsets of rows and columns with high rates of missing data were removed from the dataset (a sketch of the Fill.Rate and Missing.Rate screens appears after this list):
- 4 variables with Fill.Rate<0.9 were excluded from subsequent analysis.
- RNDGDP: Null.Count = 103, Fill.Rate = 0.418
- PATRES: Null.Count = 69, Fill.Rate = 0.610
- ENRTER: Null.Count = 61, Fill.Rate = 0.655
- RELOUT: Null.Count = 24, Fill.Rate = 0.864
- 14 rows with Missing.Rate>0.2 were excluded from subsequent analysis.
- COUNTRY=Guadeloupe: Missing.Rate= 0.909
- COUNTRY=Martinique: Missing.Rate= 0.909
- COUNTRY=French Guiana: Missing.Rate= 0.909
- COUNTRY=New Caledonia: Missing.Rate= 0.500
- COUNTRY=French Polynesia: Missing.Rate= 0.500
- COUNTRY=Guam: Missing.Rate= 0.500
- COUNTRY=Puerto Rico: Missing.Rate= 0.409
- COUNTRY=North Korea: Missing.Rate= 0.227
- COUNTRY=Somalia: Missing.Rate= 0.227
- COUNTRY=South Sudan: Missing.Rate= 0.227
- COUNTRY=Venezuela: Missing.Rate= 0.227
- COUNTRY=Libya: Missing.Rate= 0.227
- COUNTRY=Eritrea: Missing.Rate= 0.227
- COUNTRY=Yemen: Missing.Rate= 0.227
- No variables were removed due to zero or near-zero variance.
- The cleaned dataset is comprised of:
- 163 rows (observations)
- 18 columns (variables)
- 1/18 metadata (object)
- COUNTRY
- 1/18 target (categorical)
- CANRAT
- 15/18 predictor (numeric)
- GDPPER
- URBPOP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- METEMI
- FORARE
- CO2EMI
- PM2EXP
- POPDEN
- GDPCAP
- EPISCO
- 1/18 predictor (categorical)
- HDICAT
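- The Fill.Rate and Missing.Rate screens cited above were computed in an earlier step not shown in this excerpt; the sketch below is a minimal, assumed reconstruction of the two helper tables (column_low_fill_rate and row_high_missing_rate) consumed by the filtering code that follows.
##################################
# Sketching the Fill.Rate and Missing.Rate screens
# (assumed reconstruction of an earlier step)
##################################
# Fill.Rate per column = non-null count / row count
column_fill_rate = cancer_rate.count() / len(cancer_rate)
column_low_fill_rate = (column_fill_rate[column_fill_rate < 0.90]
                        .rename('Fill.Rate')
                        .rename_axis('Column.Name')
                        .reset_index())
# Missing.Rate per row = null count / column count
row_missing_rate = cancer_rate.isna().sum(axis=1) / len(cancer_rate.columns)
row_high_missing_rate = pd.DataFrame({'Row.Name': cancer_rate['COUNTRY'],
                                      'Missing.Rate': row_missing_rate})
row_high_missing_rate = row_high_missing_rate[row_high_missing_rate['Missing.Rate'] > 0.20]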
##################################
# Performing a general exploration of the original dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate.shape)
Dataset Dimensions:
(177, 22)
##################################
# Filtering out the rows with
# with Missing.Rate > 0.20
##################################
cancer_rate_filtered_row = cancer_rate.drop(cancer_rate[cancer_rate.COUNTRY.isin(row_high_missing_rate['Row.Name'].values.tolist())].index)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_filtered_row.shape)
Dataset Dimensions:
(163, 22)
##################################
# Filtering out the columns with
# with Fill.Rate < 0.90
##################################
cancer_rate_filtered_row_column = cancer_rate_filtered_row.drop(column_low_fill_rate['Column.Name'].values.tolist(), axis=1)
##################################
# Formulating a new dataset object
# for the cleaned data
##################################
cancer_rate_cleaned = cancer_rate_filtered_row_column
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_cleaned.shape)
Dataset Dimensions:
(163, 18)
1.3.3.2 Missing Data Imputation ¶
- Missing data for numeric variables were imputed using the iterative imputer algorithm with a linear regression estimator.
- GDPPER: Null.Count = 1
- FORARE: Null.Count = 1
- PM2EXP: Null.Count = 5
- Missing data for categorical variables were imputed using the most frequent value.
- HDICAT: Null.Count = 1
##################################
# Formulating the summary
# for all cleaned columns
##################################
cleaned_column_quality_summary = pd.DataFrame(zip(list(cancer_rate_cleaned.columns),
list(cancer_rate_cleaned.dtypes),
list([len(cancer_rate_cleaned)] * len(cancer_rate_cleaned.columns)),
list(cancer_rate_cleaned.count()),
list(cancer_rate_cleaned.isna().sum(axis=0))),
columns=['Column.Name',
'Column.Type',
'Row.Count',
'Non.Null.Count',
'Null.Count'])
display(cleaned_column_quality_summary)
| | Column.Name | Column.Type | Row.Count | Non.Null.Count | Null.Count |
|---|---|---|---|---|---|
| 0 | COUNTRY | object | 163 | 163 | 0 |
| 1 | CANRAT | category | 163 | 163 | 0 |
| 2 | GDPPER | float64 | 163 | 162 | 1 |
| 3 | URBPOP | float64 | 163 | 163 | 0 |
| 4 | POPGRO | float64 | 163 | 163 | 0 |
| 5 | LIFEXP | float64 | 163 | 163 | 0 |
| 6 | TUBINC | float64 | 163 | 163 | 0 |
| 7 | DTHCMD | float64 | 163 | 163 | 0 |
| 8 | AGRLND | float64 | 163 | 163 | 0 |
| 9 | GHGEMI | float64 | 163 | 163 | 0 |
| 10 | METEMI | float64 | 163 | 163 | 0 |
| 11 | FORARE | float64 | 163 | 162 | 1 |
| 12 | CO2EMI | float64 | 163 | 163 | 0 |
| 13 | PM2EXP | float64 | 163 | 158 | 5 |
| 14 | POPDEN | float64 | 163 | 163 | 0 |
| 15 | GDPCAP | float64 | 163 | 163 | 0 |
| 16 | HDICAT | category | 163 | 162 | 1 |
| 17 | EPISCO | float64 | 163 | 163 | 0 |
##################################
# Formulating the cleaned dataset
# with object columns only
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='object')
##################################
# Formulating the cleaned dataset
# with numeric columns only
##################################
cancer_rate_cleaned_numeric = cancer_rate_cleaned.select_dtypes(include='number')
##################################
# Taking a snapshot of the cleaned dataset
##################################
cancer_rate_cleaned_numeric.head()
| | GDPPER | URBPOP | POPGRO | LIFEXP | TUBINC | DTHCMD | AGRLND | GHGEMI | METEMI | FORARE | CO2EMI | PM2EXP | POPDEN | GDPCAP | EPISCO |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 98380.63601 | 86.241 | 1.235701 | 83.200000 | 7.2 | 4.941054 | 46.252480 | 5.719031e+05 | 131484.763200 | 17.421315 | 14.772658 | 24.893584 | 3.335312 | 51722.06900 | 60.1 |
| 1 | 77541.76438 | 86.699 | 2.204789 | 82.256098 | 7.2 | 4.354730 | 38.562911 | 8.015803e+04 | 32241.937000 | 37.570126 | 6.160799 | NaN | 19.331586 | 41760.59478 | 56.7 |
| 2 | 198405.87500 | 63.653 | 1.029111 | 82.556098 | 5.3 | 5.684596 | 65.495718 | 5.949773e+04 | 15252.824630 | 11.351720 | 6.768228 | 0.274092 | 72.367281 | 85420.19086 | 57.4 |
| 3 | 130941.63690 | 82.664 | 0.964348 | 76.980488 | 2.3 | 5.302060 | 44.363367 | 5.505181e+06 | 748241.402900 | 33.866926 | 13.032828 | 3.343170 | 36.240985 | 63528.63430 | 51.1 |
| 4 | 113300.60110 | 88.116 | 0.291641 | 81.602439 | 4.1 | 6.826140 | 65.499675 | 4.113555e+04 | 7778.773921 | 15.711000 | 4.691237 | 56.914456 | 145.785100 | 60915.42440 | 77.9 |
##################################
# Defining the estimator to be used
# at each step of the round-robin imputation
##################################
lr = LinearRegression()
##################################
# Defining the parameter of the
# iterative imputer which will estimate
# the columns with missing values
# as a function of the other columns
# in a round-robin fashion
##################################
iterative_imputer = IterativeImputer(
estimator = lr,
max_iter = 10,
tol = 1e-10,
imputation_order = 'ascending',
random_state=88888888
)
##################################
# Implementing the iterative imputer
##################################
cancer_rate_imputed_numeric_array = iterative_imputer.fit_transform(cancer_rate_cleaned_numeric)
##################################
# Transforming the imputed data
# from an array to a dataframe
##################################
cancer_rate_imputed_numeric = pd.DataFrame(cancer_rate_imputed_numeric_array,
columns = cancer_rate_cleaned_numeric.columns)
##################################
# Taking a snapshot of the imputed dataset
##################################
cancer_rate_imputed_numeric.head()
| | GDPPER | URBPOP | POPGRO | LIFEXP | TUBINC | DTHCMD | AGRLND | GHGEMI | METEMI | FORARE | CO2EMI | PM2EXP | POPDEN | GDPCAP | EPISCO |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 98380.63601 | 86.241 | 1.235701 | 83.200000 | 7.2 | 4.941054 | 46.252480 | 5.719031e+05 | 131484.763200 | 17.421315 | 14.772658 | 24.893584 | 3.335312 | 51722.06900 | 60.1 |
| 1 | 77541.76438 | 86.699 | 2.204789 | 82.256098 | 7.2 | 4.354730 | 38.562911 | 8.015803e+04 | 32241.937000 | 37.570126 | 6.160799 | 65.867296 | 19.331586 | 41760.59478 | 56.7 |
| 2 | 198405.87500 | 63.653 | 1.029111 | 82.556098 | 5.3 | 5.684596 | 65.495718 | 5.949773e+04 | 15252.824630 | 11.351720 | 6.768228 | 0.274092 | 72.367281 | 85420.19086 | 57.4 |
| 3 | 130941.63690 | 82.664 | 0.964348 | 76.980488 | 2.3 | 5.302060 | 44.363367 | 5.505181e+06 | 748241.402900 | 33.866926 | 13.032828 | 3.343170 | 36.240985 | 63528.63430 | 51.1 |
| 4 | 113300.60110 | 88.116 | 0.291641 | 81.602439 | 4.1 | 6.826140 | 65.499675 | 4.113555e+04 | 7778.773921 | 15.711000 | 4.691237 | 56.914456 | 145.785100 | 60915.42440 | 77.9 |
##################################
# Formulating the cleaned dataset
# with categorical columns only
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='category').copy()
##################################
# Imputing the missing data
# for categorical columns with
# the most frequent category
##################################
cancer_rate_cleaned_categorical['HDICAT'] = cancer_rate_cleaned_categorical['HDICAT'].fillna(cancer_rate_cleaned_categorical['HDICAT'].mode()[0])
cancer_rate_imputed_categorical = cancer_rate_cleaned_categorical.reset_index(drop=True)
##################################
# Formulating the imputed dataset
##################################
cancer_rate_imputed = pd.concat([cancer_rate_imputed_numeric,cancer_rate_imputed_categorical], axis=1, join='inner')
##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate_imputed.dtypes)
##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate_imputed.columns)
##################################
# Gathering the number of observations for each column
##################################
row_count_list = list([len(cancer_rate_imputed)] * len(cancer_rate_imputed.columns))
##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate_imputed.isna().sum(axis=0))
##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate_imputed.count())
##################################
# Gathering the missing data percentage for each column
##################################
fill_rate_list = map(truediv, non_null_count_list, row_count_list)
##################################
# Formulating the summary
# for all imputed columns
##################################
imputed_column_quality_summary = pd.DataFrame(zip(variable_name_list,
data_type_list,
row_count_list,
non_null_count_list,
null_count_list,
fill_rate_list),
columns=['Column.Name',
'Column.Type',
'Row.Count',
'Non.Null.Count',
'Null.Count',
'Fill.Rate'])
display(imputed_column_quality_summary)
| | Column.Name | Column.Type | Row.Count | Non.Null.Count | Null.Count | Fill.Rate |
|---|---|---|---|---|---|---|
| 0 | GDPPER | float64 | 163 | 163 | 0 | 1.0 |
| 1 | URBPOP | float64 | 163 | 163 | 0 | 1.0 |
| 2 | POPGRO | float64 | 163 | 163 | 0 | 1.0 |
| 3 | LIFEXP | float64 | 163 | 163 | 0 | 1.0 |
| 4 | TUBINC | float64 | 163 | 163 | 0 | 1.0 |
| 5 | DTHCMD | float64 | 163 | 163 | 0 | 1.0 |
| 6 | AGRLND | float64 | 163 | 163 | 0 | 1.0 |
| 7 | GHGEMI | float64 | 163 | 163 | 0 | 1.0 |
| 8 | METEMI | float64 | 163 | 163 | 0 | 1.0 |
| 9 | FORARE | float64 | 163 | 163 | 0 | 1.0 |
| 10 | CO2EMI | float64 | 163 | 163 | 0 | 1.0 |
| 11 | PM2EXP | float64 | 163 | 163 | 0 | 1.0 |
| 12 | POPDEN | float64 | 163 | 163 | 0 | 1.0 |
| 13 | GDPCAP | float64 | 163 | 163 | 0 | 1.0 |
| 14 | EPISCO | float64 | 163 | 163 | 0 | 1.0 |
| 15 | CANRAT | category | 163 | 163 | 0 | 1.0 |
| 16 | HDICAT | category | 163 | 163 | 0 | 1.0 |
1.3.3.3 Outlier Detection ¶
- A high number of outliers was observed for 5 numeric variables with Outlier.Ratio>0.10 and marginal to high Skewness (the interquartile-range rule used to flag outliers is given after this list).
- PM2EXP: Outlier.Count = 37, Outlier.Ratio = 0.226, Skewness=-3.088
- GHGEMI: Outlier.Count = 27, Outlier.Ratio = 0.165, Skewness=+9.299
- GDPCAP: Outlier.Count = 22, Outlier.Ratio = 0.134, Skewness=+2.311
- POPDEN: Outlier.Count = 20, Outlier.Ratio = 0.122, Skewness=+9.972
- METEMI: Outlier.Count = 20, Outlier.Ratio = 0.122, Skewness=+5.688
- A minimal number of outliers was observed for 4 numeric variables with Outlier.Ratio<0.10 and lower Skewness.
- TUBINC: Outlier.Count = 12, Outlier.Ratio = 0.073, Skewness=+1.747
- CO2EMI: Outlier.Count = 11, Outlier.Ratio = 0.067, Skewness=+2.693
- GDPPER: Outlier.Count = 3, Outlier.Ratio = 0.018, Skewness=+1.554
- EPISCO: Outlier.Count = 3, Outlier.Ratio = 0.018, Skewness=+0.635
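- For reference, the outlier flags reported above follow the standard interquartile-range rule implemented in the code below: with first and third quartiles $Q_1$ and $Q_3$ and $IQR = Q_3 - Q_1$, a value $x$ is counted as an outlier when

$$x < Q_1 - 1.5 \, IQR \quad \text{or} \quad x > Q_3 + 1.5 \, IQR$$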
##################################
# Formulating the imputed dataset
# with numeric columns only
##################################
cancer_rate_imputed_numeric = cancer_rate_imputed.select_dtypes(include='number')
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = list(cancer_rate_imputed_numeric.columns)
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_imputed_numeric.skew()
##################################
# Computing the interquartile range
# for all columns
##################################
cancer_rate_imputed_numeric_q1 = cancer_rate_imputed_numeric.quantile(0.25)
cancer_rate_imputed_numeric_q3 = cancer_rate_imputed_numeric.quantile(0.75)
cancer_rate_imputed_numeric_iqr = cancer_rate_imputed_numeric_q3 - cancer_rate_imputed_numeric_q1
##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
##################################
numeric_outlier_count_list = ((cancer_rate_imputed_numeric < (cancer_rate_imputed_numeric_q1 - 1.5 * cancer_rate_imputed_numeric_iqr)) | (cancer_rate_imputed_numeric > (cancer_rate_imputed_numeric_q3 + 1.5 * cancer_rate_imputed_numeric_iqr))).sum()
##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = list([len(cancer_rate_imputed_numeric)] * len(cancer_rate_imputed_numeric.columns))
##################################
# Gathering the outlier count to row count ratio for each numeric column
##################################
numeric_outlier_ratio_list = map(truediv, numeric_outlier_count_list, numeric_row_count_list)
##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
numeric_skewness_list,
numeric_outlier_count_list,
numeric_row_count_list,
numeric_outlier_ratio_list),
columns=['Numeric.Column.Name',
'Skewness',
'Outlier.Count',
'Row.Count',
'Outlier.Ratio'])
display(numeric_column_outlier_summary)
| | Numeric.Column.Name | Skewness | Outlier.Count | Row.Count | Outlier.Ratio |
|---|---|---|---|---|---|
| 0 | GDPPER | 1.554457 | 3 | 163 | 0.018405 |
| 1 | URBPOP | -0.212327 | 0 | 163 | 0.000000 |
| 2 | POPGRO | -0.181666 | 0 | 163 | 0.000000 |
| 3 | LIFEXP | -0.329704 | 0 | 163 | 0.000000 |
| 4 | TUBINC | 1.747962 | 12 | 163 | 0.073620 |
| 5 | DTHCMD | 0.930709 | 0 | 163 | 0.000000 |
| 6 | AGRLND | 0.035315 | 0 | 163 | 0.000000 |
| 7 | GHGEMI | 9.299960 | 27 | 163 | 0.165644 |
| 8 | METEMI | 5.688689 | 20 | 163 | 0.122699 |
| 9 | FORARE | 0.563015 | 0 | 163 | 0.000000 |
| 10 | CO2EMI | 2.693585 | 11 | 163 | 0.067485 |
| 11 | PM2EXP | -3.088403 | 37 | 163 | 0.226994 |
| 12 | POPDEN | 9.972806 | 20 | 163 | 0.122699 |
| 13 | GDPCAP | 2.311079 | 22 | 163 | 0.134969 |
| 14 | EPISCO | 0.635994 | 3 | 163 | 0.018405 |
##################################
# Formulating the individual boxplots
# for all numeric columns
##################################
for column in cancer_rate_imputed_numeric:
plt.figure(figsize=(17,1))
sns.boxplot(data=cancer_rate_imputed_numeric, x=column)
1.3.3.4 Collinearity ¶
- The majority of the numeric variables showed moderate to high pairwise correlations that were statistically significant.
- Among pairwise combinations of numeric variables, high Pearson.Correlation.Coefficient values were noted for:
- GDPPER and GDPCAP: Pearson.Correlation.Coefficient = +0.921
- GHGEMI and METEMI: Pearson.Correlation.Coefficient = +0.905
- Among the highly correlated pairs, the variable with the lower correlation against the target variable was removed (a sketch of this target-correlation check appears after this list).
- GDPPER: Pearson.Correlation.Coefficient = +0.690
- METEMI: Pearson.Correlation.Coefficient = +0.062
- The cleaned dataset is comprised of:
- 163 rows (observations)
- 16 columns (variables)
- 1/16 metadata (object)
- COUNTRY
- 1/16 target (categorical)
- CANRAT
- 13/16 predictor (numeric)
- URBPOP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- FORARE
- CO2EMI
- PM2EXP
- POPDEN
- GDPCAP
- EPISCO
- 1/16 predictor (categorical)
- HDICAT
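- The target-correlation values cited above (GDPPER = +0.690, METEMI = +0.062) come from a step not shown in this excerpt; the sketch below is one assumed way to reproduce them, using a point-biserial correlation (a Pearson correlation against the binary-coded CANRAT target); the sign convention depends on the category coding.
##################################
# Sketching the correlation of the
# highly correlated predictors against
# the binary-coded target
# (assumed reconstruction using a
# point-biserial correlation)
##################################
canrat_coded = cancer_rate_cleaned['CANRAT'].cat.codes.reset_index(drop=True)
for predictor in ['GDPPER', 'GDPCAP', 'GHGEMI', 'METEMI']:
    correlation, p_value = stats.pointbiserialr(canrat_coded,
                                                cancer_rate_imputed_numeric[predictor])
    print(f'{predictor}: {correlation:+.3f} (p={p_value:.3e})')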
##################################
# Formulating a function
# to plot the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
def plot_correlation_matrix(corr, mask=None):
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr,
ax=ax,
mask=mask,
annot=True,
vmin=-1,
vmax=1,
center=0,
cmap='coolwarm',
linewidths=1,
linecolor='gray',
cbar_kws={'orientation': 'horizontal'})
##################################
# Computing the correlation coefficients
# and correlation p-values
# among pairs of numeric columns
##################################
cancer_rate_imputed_numeric_correlation_pairs = {}
cancer_rate_imputed_numeric_columns = cancer_rate_imputed_numeric.columns.tolist()
for numeric_column_a, numeric_column_b in itertools.combinations(cancer_rate_imputed_numeric_columns, 2):
cancer_rate_imputed_numeric_correlation_pairs[numeric_column_a + '_' + numeric_column_b] = stats.pearsonr(
cancer_rate_imputed_numeric.loc[:, numeric_column_a],
cancer_rate_imputed_numeric.loc[:, numeric_column_b])
##################################
# Formulating the pairwise correlation summary
# for all numeric columns
##################################
cancer_rate_imputed_numeric_summary = pd.DataFrame.from_dict(cancer_rate_imputed_numeric_correlation_pairs, orient='index')
cancer_rate_imputed_numeric_summary.columns = ['Pearson.Correlation.Coefficient', 'Correlation.PValue']
display(cancer_rate_imputed_numeric_summary.sort_values(by=['Pearson.Correlation.Coefficient'], ascending=False).head(20))
| | Pearson.Correlation.Coefficient | Correlation.PValue |
|---|---|---|
| GDPPER_GDPCAP | 0.921010 | 8.158179e-68 |
| GHGEMI_METEMI | 0.905121 | 1.087643e-61 |
| POPGRO_DTHCMD | 0.759470 | 7.124695e-32 |
| GDPPER_LIFEXP | 0.755787 | 2.055178e-31 |
| GDPCAP_EPISCO | 0.696707 | 5.312642e-25 |
| LIFEXP_GDPCAP | 0.683834 | 8.321371e-24 |
| GDPPER_EPISCO | 0.680812 | 1.555304e-23 |
| GDPPER_URBPOP | 0.666394 | 2.781623e-22 |
| GDPPER_CO2EMI | 0.654958 | 2.450029e-21 |
| TUBINC_DTHCMD | 0.643615 | 1.936081e-20 |
| URBPOP_LIFEXP | 0.623997 | 5.669778e-19 |
| LIFEXP_EPISCO | 0.620271 | 1.048393e-18 |
| URBPOP_GDPCAP | 0.559181 | 8.624533e-15 |
| CO2EMI_GDPCAP | 0.550221 | 2.782997e-14 |
| URBPOP_CO2EMI | 0.550046 | 2.846393e-14 |
| LIFEXP_CO2EMI | 0.531305 | 2.951829e-13 |
| URBPOP_EPISCO | 0.510131 | 3.507463e-12 |
| POPGRO_TUBINC | 0.442339 | 3.384403e-09 |
| DTHCMD_PM2EXP | 0.283199 | 2.491837e-04 |
| CO2EMI_EPISCO | 0.282734 | 2.553620e-04 |
##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
cancer_rate_imputed_numeric_correlation = cancer_rate_imputed_numeric.corr()
mask = np.triu(cancer_rate_imputed_numeric_correlation)
plot_correlation_matrix(cancer_rate_imputed_numeric_correlation,mask)
plt.show()
##################################
# Formulating a function
# to plot the correlation matrix
# for all pairwise combinations
# of numeric columns
# with significant p-values only
##################################
def correlation_significance(df=None):
p_matrix = np.zeros(shape=(df.shape[1],df.shape[1]))
for col in df.columns:
for col2 in df.drop(col,axis=1).columns:
_ , p = stats.pearsonr(df[col],df[col2])
p_matrix[df.columns.to_list().index(col),df.columns.to_list().index(col2)] = p
return p_matrix
##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
# with significant p-values only
##################################
cancer_rate_imputed_numeric_correlation_p_values = correlation_significance(cancer_rate_imputed_numeric)
mask = np.invert(np.tril(cancer_rate_imputed_numeric_correlation_p_values<0.05))
plot_correlation_matrix(cancer_rate_imputed_numeric_correlation,mask)
##################################
# Filtering out one among the
# highly correlated variable pairs with
# lesser Pearson.Correlation.Coefficient
# when compared to the target variable
##################################
cancer_rate_imputed_numeric.drop(['GDPPER','METEMI'], inplace=True, axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_imputed_numeric.shape)
Dataset Dimensions:
(163, 13)
1.3.3.5 Shape Transformation ¶
- A Yeo-Johnson transformation (defined after this list) was applied to all numeric variables to improve distributional shape.
- Most variables achieved symmetrical distributions with minimal outliers after transformation.
- One variable which remained skewed even after applying shape transformation was removed.
- PM2EXP
- The transformed dataset is comprised of:
- 163 rows (observations)
- 15 columns (variables)
- 1/15 metadata (object)
- COUNTRY
- 1/15 target (categorical)
- CANRAT
- 12/15 predictor (numeric)
- URBPOP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- FORARE
- CO2EMI
- POPDEN
- GDPCAP
- EPISCO
- 1/15 predictor (categorical)
- HDICAT
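- For reference, the Yeo-Johnson transformation applied below estimates a power parameter $\lambda$ for each variable by maximum likelihood and maps every value $y$ as

$$\psi(y; \lambda) = \begin{cases} \dfrac{(y + 1)^{\lambda} - 1}{\lambda} & y \geq 0, \; \lambda \neq 0 \\ \ln(y + 1) & y \geq 0, \; \lambda = 0 \\ -\dfrac{(1 - y)^{2 - \lambda} - 1}{2 - \lambda} & y < 0, \; \lambda \neq 2 \\ -\ln(1 - y) & y < 0, \; \lambda = 2 \end{cases}$$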
##################################
# Conducting a Yeo-Johnson Transformation
# to address the distributional
# shape of the variables
##################################
yeo_johnson_transformer = PowerTransformer(method='yeo-johnson',
standardize=False)
cancer_rate_imputed_numeric_array = yeo_johnson_transformer.fit_transform(cancer_rate_imputed_numeric)
##################################
# Formulating a new dataset object
# for the transformed data
##################################
cancer_rate_transformed_numeric = pd.DataFrame(cancer_rate_imputed_numeric_array,
columns=cancer_rate_imputed_numeric.columns)
##################################
# Formulating the individual boxplots
# for all transformed numeric columns
##################################
for column in cancer_rate_transformed_numeric:
plt.figure(figsize=(17,1))
sns.boxplot(data=cancer_rate_transformed_numeric, x=column)
##################################
# Filtering out the column
# which remained skewed even
# after applying shape transformation
##################################
cancer_rate_transformed_numeric.drop(['PM2EXP'], inplace=True, axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_transformed_numeric.shape)
Dataset Dimensions:
(163, 12)
1.3.3.6 Centering and Scaling ¶
- All numeric variables were transformed using the standardization method to achieve a comparable scale between values.
- The scaled dataset is comprised of:
- 163 rows (observations)
- 15 columns (variables)
- 1/15 metadata (object)
- COUNTRY
- 1/15 target (categorical)
- CANRAT
- 12/15 predictor (numeric)
- URBPOP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- FORARE
- CO2EMI
- POPDEN
- GDPCAP
- EPISCO
- 1/15 predictor (categorical)
- HDICAT
##################################
# Conducting standardization
# to transform the values of the
# variables into comparable scale
##################################
standardization_scaler = StandardScaler()
cancer_rate_transformed_numeric_array = standardization_scaler.fit_transform(cancer_rate_transformed_numeric)
##################################
# Formulating a new dataset object
# for the scaled data
##################################
cancer_rate_scaled_numeric = pd.DataFrame(cancer_rate_transformed_numeric_array,
columns=cancer_rate_transformed_numeric.columns)
##################################
# Formulating the individual boxplots
# for all transformed numeric columns
##################################
for column in cancer_rate_scaled_numeric:
plt.figure(figsize=(17,1))
sns.boxplot(data=cancer_rate_scaled_numeric, x=column)
1.3.3.7 Data Encoding ¶
- One-hot encoding was applied to the HDICAT variable, resulting in 4 additional columns in the dataset:
- HDICAT_L
- HDICAT_M
- HDICAT_H
- HDICAT_VH
##################################
# Formulating the categorical column
# for encoding transformation
##################################
cancer_rate_categorical_encoded = pd.DataFrame(cancer_rate_cleaned_categorical.loc[:, 'HDICAT'].to_list(),
columns=['HDICAT'])
##################################
# Applying a one-hot encoding transformation
# for the categorical column
##################################
cancer_rate_categorical_encoded = pd.get_dummies(cancer_rate_categorical_encoded, columns=['HDICAT'])
1.3.3.8 Preprocessed Data Description ¶
- The preprocessed dataset is comprised of:
- 163 rows (observations)
- 18 columns (variables)
- 1/18 metadata (object)
- COUNTRY
- 1/18 target (categorical)
- CANRAT
- 12/18 predictor (numeric)
- URBPOP
- POPGRO
- LIFEXP
- TUBINC
- DTHCMD
- AGRLND
- GHGEMI
- FORARE
- CO2EMI
- POPDEN
- GDPCAP
- EPISCO
- 4/18 predictor (categorical)
- HDICAT_L
- HDICAT_M
- HDICAT_H
- HDICAT_VH
##################################
# Consolidating both numeric columns
# and encoded categorical columns
##################################
cancer_rate_preprocessed = pd.concat([cancer_rate_scaled_numeric,cancer_rate_categorical_encoded], axis=1, join='inner')
##################################
# Performing a general exploration of the consolidated dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_preprocessed.shape)
Dataset Dimensions:
(163, 16)
1.3.4 Data Exploration ¶
1.3.4.1 Exploratory Data Analysis ¶
- Bivariate analysis identified individual predictors with apparent associations to the target variable based on visual inspection.
- Higher values or higher proportions for the following predictors are associated with the CANRAT HIGH category:
- URBPOP
- LIFEXP
- CO2EMI
- GDPCAP
- EPISCO
- HDICAT_VH=1
- Lower values or smaller proportions for the following predictors are associated with the CANRAT HIGH category:
- POPGRO
- TUBINC
- DTHCMD
- HDICAT_L=0
- HDICAT_M=0
- HDICAT_H=0
- Values for the following predictors are not associated with the CANRAT HIGH or LOW categories:
- AGRLND
- GHGEMI
- FORARE
- POPDEN
##################################
# Segregating the target
# and predictor variable lists
##################################
cancer_rate_preprocessed_target = cancer_rate_filtered_row['CANRAT'].to_frame()
cancer_rate_preprocessed_target.reset_index(inplace=True, drop=True)
cancer_rate_preprocessed_categorical = cancer_rate_preprocessed[cancer_rate_categorical_encoded.columns]
cancer_rate_preprocessed_categorical_combined = cancer_rate_preprocessed_categorical.join(cancer_rate_preprocessed_target)
cancer_rate_preprocessed = cancer_rate_preprocessed.drop(cancer_rate_categorical_encoded.columns, axis=1)
cancer_rate_preprocessed_predictors = cancer_rate_preprocessed.columns
cancer_rate_preprocessed_combined = cancer_rate_preprocessed.join(cancer_rate_preprocessed_target)
cancer_rate_preprocessed_all = cancer_rate_preprocessed_combined.join(cancer_rate_categorical_encoded)
##################################
# Segregating the target
# and predictor variable names
##################################
y_variable = 'CANRAT'
x_variables = cancer_rate_preprocessed_predictors
##################################
# Defining the number of
# rows and columns for the subplots
##################################
num_rows = 6
num_cols = 2
##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 30))
##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()
##################################
# Formulating the individual boxplots
# for all scaled numeric columns
##################################
for i, x_variable in enumerate(x_variables):
ax = axes[i]
ax.boxplot([group[x_variable] for name, group in cancer_rate_preprocessed_combined.groupby(y_variable, observed=True)])
ax.set_title(f'{y_variable} Versus {x_variable}')
ax.set_xlabel(y_variable)
ax.set_ylabel(x_variable)
ax.set_xticks(range(1, len(cancer_rate_preprocessed_combined[y_variable].unique()) + 1), ['Low', 'High'])
##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()
##################################
# Presenting the subplots
##################################
plt.show()
##################################
# Segregating the target
# and predictor variable names
##################################
y_variables = cancer_rate_preprocessed_categorical.columns
x_variable = 'CANRAT'
##################################
# Defining the number of
# rows and columns for the subplots
##################################
num_rows = 2
num_cols = 2
##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))
##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()
##################################
# Formulating the individual stacked column plots
# for all categorical columns
##################################
for i, y_variable in enumerate(y_variables):
ax = axes[i]
category_counts = cancer_rate_preprocessed_categorical_combined.groupby([x_variable, y_variable], observed=True).size().unstack(fill_value=0)
category_proportions = category_counts.div(category_counts.sum(axis=1), axis=0)
category_proportions.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(f'{x_variable} Versus {y_variable}')
ax.set_xlabel(x_variable)
ax.set_ylabel('Proportions')
##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()
##################################
# Presenting the subplots
##################################
plt.show()
1.3.4.2 Hypothesis Testing ¶
- The relationship between the numeric predictors and the CANRAT target variable was statistically evaluated using the following hypotheses:
- Null: Difference in the means between groups LOW and HIGH is equal to zero
- Alternative: Difference in the means between groups LOW and HIGH is not equal to zero
- There is sufficient evidence to conclude that the means of the numeric measurements obtained from the LOW and HIGH groups of the CANRAT target variable differ significantly for 9 of the 12 numeric predictors, given their large absolute t-test statistics and p-values below the 0.05 significance level (the test statistic is defined after this list).
- GDPCAP: T.Test.Statistic=-11.937, T.Test.PValue=0.000
- EPISCO: T.Test.Statistic=-11.789, T.Test.PValue=0.000
- LIFEXP: T.Test.Statistic=-10.979, T.Test.PValue=0.000
- TUBINC: T.Test.Statistic=+9.609, T.Test.PValue=0.000
- DTHCMD: T.Test.Statistic=+8.376, T.Test.PValue=0.000
- CO2EMI: T.Test.Statistic=-7.031, T.Test.PValue=0.000
- URBPOP: T.Test.Statistic=-6.541, T.Test.PValue=0.000
- POPGRO: T.Test.Statistic=+4.905, T.Test.PValue=0.000
- GHGEMI: T.Test.Statistic=-2.243, T.Test.PValue=0.026
- The relationship between the categorical predictors and the CANRAT target variable was statistically evaluated using the following hypotheses:
- Null: The categorical predictor is independent of the categorical target variable
- Alternative: The categorical predictor is dependent on the categorical target variable
- There is sufficient evidence to conclude that all 4 categorical predictors are significantly associated with the LOW and HIGH groups of the CANRAT target variable, given their high chi-square statistic values and p-values below the 0.05 significance level.
- HDICAT_VH: ChiSquare.Test.Statistic=76.764, ChiSquare.Test.PValue=0.000
- HDICAT_M: ChiSquare.Test.Statistic=13.860, ChiSquare.Test.PValue=0.000
- HDICAT_L: ChiSquare.Test.Statistic=10.286, ChiSquare.Test.PValue=0.001
- HDICAT_H: ChiSquare.Test.Statistic=9.081, ChiSquare.Test.PValue=0.003
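- For reference, the test statistic reported above is the pooled two-sample t-statistic (matching equal_var=True in the code below):

$$t = \frac{\bar{x}_{LOW} - \bar{x}_{HIGH}}{s_p \sqrt{\frac{1}{n_{LOW}} + \frac{1}{n_{HIGH}}}}, \qquad s_p^2 = \frac{(n_{LOW} - 1) s_{LOW}^2 + (n_{HIGH} - 1) s_{HIGH}^2}{n_{LOW} + n_{HIGH} - 2}$$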
##################################
# Computing the t-test
# statistic and p-values
# between the target variable
# and numeric predictor columns
##################################
cancer_rate_preprocessed_numeric_ttest_target = {}
cancer_rate_preprocessed_numeric = cancer_rate_preprocessed_combined
cancer_rate_preprocessed_numeric_columns = cancer_rate_preprocessed_predictors
for numeric_column in cancer_rate_preprocessed_numeric_columns:
group_0 = cancer_rate_preprocessed_numeric[cancer_rate_preprocessed_numeric.loc[:,'CANRAT']=='Low']
group_1 = cancer_rate_preprocessed_numeric[cancer_rate_preprocessed_numeric.loc[:,'CANRAT']=='High']
cancer_rate_preprocessed_numeric_ttest_target['CANRAT_' + numeric_column] = stats.ttest_ind(
group_0[numeric_column],
group_1[numeric_column],
equal_var=True)
##################################
# Formulating the pairwise ttest summary
# between the target variable
# and numeric predictor columns
##################################
cancer_rate_preprocessed_numeric_summary = pd.DataFrame.from_dict(cancer_rate_preprocessed_numeric_ttest_target, orient='index')
cancer_rate_preprocessed_numeric_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(cancer_rate_preprocessed_numeric_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(12))
| | T.Test.Statistic | T.Test.PValue |
|---|---|---|
| CANRAT_GDPCAP | -11.936988 | 6.247937e-24 |
| CANRAT_EPISCO | -11.788870 | 1.605980e-23 |
| CANRAT_LIFEXP | -10.979098 | 2.754214e-21 |
| CANRAT_TUBINC | 9.608760 | 1.463678e-17 |
| CANRAT_DTHCMD | 8.375558 | 2.552108e-14 |
| CANRAT_CO2EMI | -7.030702 | 5.537463e-11 |
| CANRAT_URBPOP | -6.541001 | 7.734940e-10 |
| CANRAT_POPGRO | 4.904817 | 2.269446e-06 |
| CANRAT_GHGEMI | -2.243089 | 2.625563e-02 |
| CANRAT_FORARE | -1.174143 | 2.420717e-01 |
| CANRAT_POPDEN | -0.495221 | 6.211191e-01 |
| CANRAT_AGRLND | -0.047628 | 9.620720e-01 |
##################################
# Computing the chisquare
# statistic and p-values
# between the target variable
# and categorical predictor columns
##################################
cancer_rate_preprocessed_categorical_chisquare_target = {}
cancer_rate_preprocessed_categorical = cancer_rate_preprocessed_categorical_combined
cancer_rate_preprocessed_categorical_columns = ['HDICAT_L','HDICAT_M','HDICAT_H','HDICAT_VH']
for categorical_column in cancer_rate_preprocessed_categorical_columns:
contingency_table = pd.crosstab(cancer_rate_preprocessed_categorical[categorical_column],
cancer_rate_preprocessed_categorical['CANRAT'])
cancer_rate_preprocessed_categorical_chisquare_target['CANRAT_' + categorical_column] = stats.chi2_contingency(
contingency_table)[0:2]
##################################
# Formulating the pairwise chisquare summary
# between the target variable
# and categorical predictor columns
##################################
cancer_rate_preprocessed_categorical_summary = pd.DataFrame.from_dict(cancer_rate_preprocessed_categorical_chisquare_target, orient='index')
cancer_rate_preprocessed_categorical_summary.columns = ['ChiSquare.Test.Statistic', 'ChiSquare.Test.PValue']
display(cancer_rate_preprocessed_categorical_summary.sort_values(by=['ChiSquare.Test.PValue'], ascending=True).head(4))
| | ChiSquare.Test.Statistic | ChiSquare.Test.PValue |
|---|---|---|
| CANRAT_HDICAT_VH | 76.764134 | 1.926446e-18 |
| CANRAT_HDICAT_M | 13.860367 | 1.969074e-04 |
| CANRAT_HDICAT_L | 10.285575 | 1.340742e-03 |
| CANRAT_HDICAT_H | 9.080788 | 2.583087e-03 |
1.3.5 Model Development With Hyperparameter Tuning ¶
1.3.5.1 Premodelling Data Description ¶
- Among the 9 numeric variables determined to have a statistically significant difference in means between the LOW and HIGH groups of the CANRAT target variable, only the 7 with absolute t-test statistics greater than 5 were retained (a sketch of an equivalent programmatic filter appears after this list).
- GDPCAP: T.Test.Statistic=-11.937, T.Test.PValue=0.000
- EPISCO: T.Test.Statistic=-11.789, T.Test.PValue=0.000
- LIFEXP: T.Test.Statistic=-10.979, T.Test.PValue=0.000
- TUBINC: T.Test.Statistic=+9.609, T.Test.PValue=0.000
- DTHCMD: T.Test.Statistic=+8.376, T.Test.PValue=0.000
- CO2EMI: T.Test.Statistic=-7.031, T.Test.PValue=0.000
- URBPOP: T.Test.Statistic=-6.541, T.Test.PValue=0.000
- Among the 4 categorical predictors determined to be significantly associated with the LOW and HIGH groups of the CANRAT target variable, only the 1 with a chi-square statistic greater than 15 was retained.
- HDICAT_VH: ChiSquare.Test.Statistic=76.764, ChiSquare.Test.PValue=0.000
- The original data, which reflect a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
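- The predictor subset dropped in the code below was selected manually from the hypothesis-testing results; the sketch that follows is an assumed, equivalent programmatic filter built on the summary tables computed in the previous section.
##################################
# Sketching an equivalent programmatic
# filter for the retained predictors
# (assumed reconstruction of the
# manual selection)
##################################
retained_numeric = (cancer_rate_preprocessed_numeric_summary
                    [cancer_rate_preprocessed_numeric_summary['T.Test.Statistic'].abs() > 5]
                    .index.str.replace('CANRAT_', '', regex=False)
                    .tolist())
retained_categorical = (cancer_rate_preprocessed_categorical_summary
                        [cancer_rate_preprocessed_categorical_summary['ChiSquare.Test.Statistic'] > 15]
                        .index.str.replace('CANRAT_', '', regex=False)
                        .tolist())
print(retained_numeric + retained_categorical)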
##################################
# Consolidating relevant numeric columns
# and encoded categorical columns
# after hypothesis testing
##################################
cancer_rate_premodelling = cancer_rate_preprocessed_all.drop(['AGRLND','POPDEN','GHGEMI','POPGRO','FORARE','HDICAT_H','HDICAT_M','HDICAT_L'], axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)
Dataset Dimensions:
(163, 9)
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)
Column Names and Data Types:
URBPOP       float64
LIFEXP       float64
TUBINC       float64
DTHCMD       float64
CO2EMI       float64
GDPCAP       float64
EPISCO       float64
CANRAT      category
HDICAT_VH       bool
dtype: object
##################################
# Gathering the pairplot for all variables
##################################
cancer_rate_predictor_pair_plot = sns.pairplot(cancer_rate_premodelling,
kind='reg',
markers=["o", "s"],
plot_kws={'scatter_kws': {'alpha': 0.3}},
hue='CANRAT');
sns.move_legend(cancer_rate_predictor_pair_plot,
"lower center",
bbox_to_anchor=(.5, 1), ncol=2, title='CANRAT', frameon=False)
plt.show()
##################################
# Separating the target
# and predictor columns
##################################
X = cancer_rate_premodelling.drop('CANRAT', axis = 1)
y = cancer_rate_premodelling['CANRAT'].cat.codes
##################################
# Formulating the train and test data
# using a 70-30 ratio
##################################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 88888888, stratify=y)
##################################
# Performing a general exploration of the train dataset
##################################
print('Dataset Dimensions: ')
display(X_train.shape)
Dataset Dimensions:
(114, 8)
##################################
# Validating the class distribution of the train dataset
##################################
y_train.value_counts(normalize = True)
0    0.745614
1    0.254386
Name: proportion, dtype: float64
##################################
# Performing a general exploration of the test dataset
##################################
print('Dataset Dimensions: ')
display(X_test.shape)
Dataset Dimensions:
(49, 8)
##################################
# Validating the class distribution of the test dataset
##################################
y_test.value_counts(normalize = True)
0    0.755102
1    0.244898
Name: proportion, dtype: float64
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
metric_name = ['Accuracy','Precision','Recall','F1','AUROC']
metric_value = [accuracy_score(y_true, y_pred),
precision_score(y_true, y_pred),
recall_score(y_true, y_pred),
f1_score(y_true, y_pred),
roc_auc_score(y_true, y_pred)]
metric_summary = pd.DataFrame(zip(metric_name, metric_value),
columns=['metric_name','metric_value'])
return(metric_summary)
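- As a quick illustration, the helper above can be exercised on hypothetical label vectors (not study data) and returns one row per metric:
##################################
# Illustrating the performance helper
# on hypothetical labels
##################################
demo_y_true = [0, 0, 0, 1, 1, 1]
demo_y_pred = [0, 0, 1, 1, 1, 0]
display(model_performance_evaluation(demo_y_true, demo_y_pred))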
1.3.5.2 Logistic Regression ¶
- The logistic regression model from the sklearn.linear_model Python library API was implemented.
- The model contains 5 hyperparameters:
- C = inverse of regularization strength held constant at a value of 1 (the penalized objective is given after this list)
- penalty = penalty norm made to vary between L1 and L2
- solver = algorithm used in the optimization problem made to vary between Saga and Liblinear
- class_weight = weights associated with classes held constant at a value of None
- max_iter = maximum number of iterations taken for the solvers to converge held constant at a value of 500
- The original data, which reflect a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation with the F1 score as the optimization metric; the optimal configuration was determined as:
- C = 1
- penalty = L1 norm
- solver = Liblinear
- class_weight = None
- max_iter = 500
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9473
- Precision = 0.8709
- Recall = 0.9310
- F1 Score = 0.9000
- AUROC = 0.9419
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8889
- Recall = 0.6667
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
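- For reference, scikit-learn parameterizes the regularized objective so that C scales the data-fit term rather than the penalty; with labels $y_i \in \{-1, +1\}$, the L1- and L2-penalized problems tuned above are

$$\min_{w, c} \; \|w\|_1 + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (x_i^\top w + c)}\right) \quad \text{(L1)}, \qquad \min_{w, c} \; \tfrac{1}{2} \|w\|_2^2 + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (x_i^\top w + c)}\right) \quad \text{(L2)}$$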
##################################
# Creating an instance of the
# Logistic Regression model
##################################
logistic_regression = LogisticRegression()
##################################
# Defining the hyperparameters for the
# Logistic Regression model
##################################
hyperparameter_grid = {
'C': [1.0],
'penalty': ['l1', 'l2'],
'solver': ['liblinear','saga'],
'class_weight': [None],
'max_iter': [500],
'random_state': [88888888]}
##################################
# Setting up the 5-fold cross-validated
# grid search for the Logistic Regression model
##################################
optimal_logistic_regression = GridSearchCV(estimator = logistic_regression,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the optimal Logistic Regression model
##################################
optimal_logistic_regression.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Logistic Regression model
##################################
optimal_logistic_regression.best_score_
optimal_logistic_regression.best_params_
{'C': 1.0, 'class_weight': None, 'max_iter': 500, 'penalty': 'l1', 'random_state': 88888888, 'solver': 'liblinear'}
##################################
# Evaluating the optimal Logistic Regression model
# on the train set
##################################
optimal_logistic_regression_y_hat_train = optimal_logistic_regression.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
optimal_logistic_regression_performance_train = model_performance_evaluation(y_train, optimal_logistic_regression_y_hat_train)
optimal_logistic_regression_performance_train['model'] = ['optimal_logistic_regression'] * 5
optimal_logistic_regression_performance_train['set'] = ['train'] * 5
print('Optimal Logistic Regression Model Performance on Train Data: ')
display(optimal_logistic_regression_performance_train)
Optimal Logistic Regression Model Performance on Train Data:
| | metric_name | metric_value | model | set |
|---|---|---|---|---|
| 0 | Accuracy | 0.947368 | optimal_logistic_regression | train |
| 1 | Precision | 0.870968 | optimal_logistic_regression | train |
| 2 | Recall | 0.931034 | optimal_logistic_regression | train |
| 3 | F1 | 0.900000 | optimal_logistic_regression | train |
| 4 | AUROC | 0.941988 | optimal_logistic_regression | train |
##################################
# Evaluating the optimal Logistic Regression model
# on the test set
##################################
optimal_logistic_regression_y_hat_test = optimal_logistic_regression.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
optimal_logistic_regression_performance_test = model_performance_evaluation(y_test, optimal_logistic_regression_y_hat_test)
optimal_logistic_regression_performance_test['model'] = ['optimal_logistic_regression'] * 5
optimal_logistic_regression_performance_test['set'] = ['test'] * 5
print('Optimal Logistic Regression Model Performance on Test Data: ')
display(optimal_logistic_regression_performance_test)
Optimal Logistic Regression Model Performance on Test Data:
| | metric_name | metric_value | model | set |
|---|---|---|---|---|
| 0 | Accuracy | 0.897959 | optimal_logistic_regression | test |
| 1 | Precision | 0.888889 | optimal_logistic_regression | test |
| 2 | Recall | 0.666667 | optimal_logistic_regression | test |
| 3 | F1 | 0.761905 | optimal_logistic_regression | test |
| 4 | AUROC | 0.819820 | optimal_logistic_regression | test |
1.3.5.3 Decision Trees ¶
- The decision tree model from the sklearn.tree Python library API was implemented.
- The model contains 4 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss (the impurity measures are given after this list)
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- class_weight = weights associated with classes held constant at a value of None
- The original data, which reflect a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation with the F1 score as the optimization metric; the optimal configuration was determined as:
- criterion = Entropy
- max_depth = 5
- min_samples_leaf = 3
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9736
- Precision = 1.0000
- Recall = 0.8965
- F1 Score = 0.9454
- AUROC = 0.9482
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8571
- Precision = 0.8571
- Recall = 0.5000
- F1 Score = 0.6315
- AUROC = 0.7364
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
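- For reference, the split-quality criteria tuned above are the standard impurity measures; for a node with class proportions $p_k$,

$$Gini = \sum_{k} p_k (1 - p_k), \qquad Entropy = -\sum_{k} p_k \log p_k$$

with the Log-Loss criterion being equivalent to Entropy in the scikit-learn implementation.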
##################################
# Creating an instance of the
# Decision Tree model
##################################
decision_tree = DecisionTreeClassifier()
##################################
# Defining the hyperparameters for the
# Decision Tree model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the 5-fold cross-validated
# grid search for the Decision Tree model
##################################
optimal_decision_tree = GridSearchCV(estimator = decision_tree,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the optimal Decision Tree model
##################################
optimal_decision_tree.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Decision Tree model
##################################
optimal_decision_tree.best_score_
optimal_decision_tree.best_params_
{'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 3, 'random_state': 88888888}
##################################
# Evaluating the optimal decision tree model
# on the train set
##################################
optimal_decision_tree_y_hat_train = optimal_decision_tree.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
optimal_decision_tree_performance_train = model_performance_evaluation(y_train, optimal_decision_tree_y_hat_train)
optimal_decision_tree_performance_train['model'] = ['optimal_decision_tree'] * 5
optimal_decision_tree_performance_train['set'] = ['train'] * 5
print('Optimal Decision Tree Model Performance on Train Data: ')
display(optimal_decision_tree_performance_train)
Optimal Decision Tree Model Performance on Train Data:
| | metric_name | metric_value | model | set |
|---|---|---|---|---|
| 0 | Accuracy | 0.973684 | optimal_decision_tree | train |
| 1 | Precision | 1.000000 | optimal_decision_tree | train |
| 2 | Recall | 0.896552 | optimal_decision_tree | train |
| 3 | F1 | 0.945455 | optimal_decision_tree | train |
| 4 | AUROC | 0.948276 | optimal_decision_tree | train |
##################################
# Evaluating the optimal decision tree model
# on the test set
##################################
optimal_decision_tree_y_hat_test = optimal_decision_tree.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
optimal_decision_tree_performance_test = model_performance_evaluation(y_test, optimal_decision_tree_y_hat_test)
optimal_decision_tree_performance_test['model'] = ['optimal_decision_tree'] * 5
optimal_decision_tree_performance_test['set'] = ['test'] * 5
print('Optimal Decision Tree Model Performance on Test Data: ')
display(optimal_decision_tree_performance_test)
Optimal Decision Tree Model Performance on Test Data:
| | metric_name | metric_value | model | set |
|---|---|---|---|---|
| 0 | Accuracy | 0.857143 | optimal_decision_tree | test |
| 1 | Precision | 0.857143 | optimal_decision_tree | test |
| 2 | Recall | 0.500000 | optimal_decision_tree | test |
| 3 | F1 | 0.631579 | optimal_decision_tree | test |
| 4 | AUROC | 0.736486 | optimal_decision_tree | test |
1.3.5.4 Random Forest ¶
- The random forest model from the sklearn.ensemble Python library API was implemented.
- The model contains 6 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- n_estimators = number of trees in the forest made to vary between 100, 150 and 200
- max_features = number of features to consider when looking for the best split made to vary between Sqrt and Log2 of the number of features
- class_weight = weights associated with classes held constant at a value of None
- The original data, which reflect a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation with the F1 score as the optimization metric; the optimal configuration was determined as:
- criterion = Gini
- max_depth = 3
- min_samples_leaf = 3
- n_estimators = 100
- max_features = Sqrt of the number of features
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9561
- Precision = 0.9285
- Recall = 0.8965
- F1 Score = 0.9122
- AUROC = 0.9365
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8775
- Precision = 0.8750
- Recall = 0.5833
- F1 Score = 0.7000
- AUROC = 0.7781
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Random Forest model
##################################
random_forest = RandomForestClassifier()
##################################
# Defining the hyperparameters for the
# Random Forest model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'n_estimators': [100,150,200],
'max_features':['sqrt', 'log2'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the 5-fold cross-validated
# grid search for the Random Forest model
##################################
optimal_random_forest = GridSearchCV(estimator = random_forest,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the optimal Random Forest model
##################################
optimal_random_forest.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Random Forest model
##################################
optimal_random_forest.best_score_
optimal_random_forest.best_params_
{'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 100, 'random_state': 88888888}
##################################
# Evaluating the optimal Random Forest model
# on the train set
##################################
optimal_random_forest_y_hat_train = optimal_random_forest.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
optimal_random_forest_performance_train = model_performance_evaluation(y_train, optimal_random_forest_y_hat_train)
optimal_random_forest_performance_train['model'] = ['optimal_random_forest'] * 5
optimal_random_forest_performance_train['set'] = ['train'] * 5
print('Optimal Random Forest Model Performance on Train Data: ')
display(optimal_random_forest_performance_train)
Optimal Random Forest Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.956140 | optimal_random_forest | train |
1 | Precision | 0.928571 | optimal_random_forest | train |
2 | Recall | 0.896552 | optimal_random_forest | train |
3 | F1 | 0.912281 | optimal_random_forest | train |
4 | AUROC | 0.936511 | optimal_random_forest | train |
##################################
# Evaluating the optimal Random Forest model
# on the test set
##################################
optimal_random_forest_y_hat_test = optimal_random_forest.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
optimal_random_forest_performance_test = model_performance_evaluation(y_test, optimal_random_forest_y_hat_test)
optimal_random_forest_performance_test['model'] = ['optimal_random_forest'] * 5
optimal_random_forest_performance_test['set'] = ['test'] * 5
print('Optimal Random Forest Model Performance on Test Data: ')
display(optimal_random_forest_performance_test)
Optimal Random Forest Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.877551 | optimal_random_forest | test |
1 | Precision | 0.875000 | optimal_random_forest | test |
2 | Recall | 0.583333 | optimal_random_forest | test |
3 | F1 | 0.700000 | optimal_random_forest | test |
4 | AUROC | 0.778153 | optimal_random_forest | test |
1.3.5.5 Support Vector Machine ¶
- The support vector machine model from the sklearn.svm Python library API was implemented.
- The model contains 3 hyperparameters:
- C = regularization parameter (inversely proportional to regularization strength) held constant at a value of 1
- kernel = kernel type to be used in the algorithm made to vary between Linear, Poly, RBF and Sigmoid
- class_weight = weights associated with classes held constant at a value of None
- The original data, reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- C = 1
- kernel = Poly
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9473
- Precision = 0.9600
- Recall = 0.8275
- F1 Score = 0.8888
- AUROC = 0.9079
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8571
- Precision = 0.8571
- Recall = 0.5000
- F1 Score = 0.6315
- AUROC = 0.7364
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Support Vector Machine model
##################################
support_vector_machine = SVC()
##################################
# Defining the hyperparameters for the
# Support Vector Machine model
##################################
hyperparameter_grid = {
'C': [1.0],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Support Vector Machine model
##################################
optimal_support_vector_machine = GridSearchCV(estimator = support_vector_machine,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the optimal Support Vector Machine model
##################################
optimal_support_vector_machine.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Support Vector Machine model
##################################
optimal_support_vector_machine.best_score_
optimal_support_vector_machine.best_params_
{'C': 1.0, 'class_weight': None, 'kernel': 'poly', 'random_state': 88888888}
##################################
# Evaluating the optimal Support Vector Machine model
# on the train set
##################################
optimal_support_vector_machine_y_hat_train = optimal_support_vector_machine.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
optimal_support_vector_machine_performance_train = model_performance_evaluation(y_train, optimal_support_vector_machine_y_hat_train)
optimal_support_vector_machine_performance_train['model'] = ['optimal_support_vector_machine'] * 5
optimal_support_vector_machine_performance_train['set'] = ['train'] * 5
print('Optimal Support Vector Machine Model Performance on Train Data: ')
display(optimal_support_vector_machine_performance_train)
Optimal Support Vector Machine Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.947368 | optimal_support_vector_machine | train |
1 | Precision | 0.960000 | optimal_support_vector_machine | train |
2 | Recall | 0.827586 | optimal_support_vector_machine | train |
3 | F1 | 0.888889 | optimal_support_vector_machine | train |
4 | AUROC | 0.907911 | optimal_support_vector_machine | train |
##################################
# Evaluating the optimal Support Vector Machine model
# on the test set
##################################
optimal_support_vector_machine_y_hat_test = optimal_support_vector_machine.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
optimal_support_vector_machine_performance_test = model_performance_evaluation(y_test, optimal_support_vector_machine_y_hat_test)
optimal_support_vector_machine_performance_test['model'] = ['optimal_support_vector_machine'] * 5
optimal_support_vector_machine_performance_test['set'] = ['test'] * 5
print('Optimal Support Vector Machine Model Performance on Test Data: ')
display(optimal_support_vector_machine_performance_test)
Optimal Support Vector Machine Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.857143 | optimal_support_vector_machine | test |
1 | Precision | 0.857143 | optimal_support_vector_machine | test |
2 | Recall | 0.500000 | optimal_support_vector_machine | test |
3 | F1 | 0.631579 | optimal_support_vector_machine | test |
4 | AUROC | 0.736486 | optimal_support_vector_machine | test |
1.3.6 Model Development With Class Weights ¶
1.3.6.1 Premodelling Data Description ¶
- Among the 9 numeric variables determined to have a statistically significant difference in means between the LOW and HIGH groups of the CANRAT target variable, only the 7 with absolute T-Test statistics greater than 5 were retained.
- GDPCAP: T.Test.Statistic=-11.937, T.Test.PValue=0.000
- EPISCO: T.Test.Statistic=-11.789, T.Test.PValue=0.000
- LIFEXP: T.Test.Statistic=-10.979, T.Test.PValue=0.000
- TUBINC: T.Test.Statistic=+9.609, T.Test.PValue=0.000
- DTHCMD: T.Test.Statistic=+8.376, T.Test.PValue=0.000
- CO2EMI: T.Test.Statistic=-7.031, T.Test.PValue=0.000
- URBPOP: T.Test.Statistic=-6.541, T.Test.PValue=0.000
- Among the 4 categorical predictors determined to have a statistically significant relationship with the LOW and HIGH groups of the CANRAT target variable, only the 1 with an absolute Chi-Square statistic greater than 15 was retained.
- HDICAT_VH: ChiSquare.Test.Statistic=76.764, ChiSquare.Test.PValue=0.000
- The original data, reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing, with class weights applied as the remedial measure (see the class-weight sketch after this list).
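For reference, the fixed 25-75 weighting used throughout this section closely matches scikit-learn's 'balanced' heuristic (inverse class frequency) applied to the 3:1 training distribution. The minimal sketch below derives those balanced weights; the y_example array is a hypothetical stand-in for the actual train labels.
##################################
# Deriving balanced class weights
# for a 3:1 class distribution
# (minimal sketch using a hypothetical
# stand-in for the train labels)
##################################
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_example = np.array([0] * 85 + [1] * 29)
balanced_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_example)
print(dict(zip([0, 1], balanced_weights)))
# {0: 0.6706, 1: 1.9655}, which normalizes to roughly {0: 0.25, 1: 0.75}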
##################################
# Consolidating relevant numeric columns
# and encoded categorical columns
# after hypothesis testing
##################################
cancer_rate_premodelling = cancer_rate_preprocessed_all.drop(['AGRLND','POPDEN','GHGEMI','POPGRO','FORARE','HDICAT_H','HDICAT_M','HDICAT_L'], axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)
Dataset Dimensions:
(163, 9)
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)
Column Names and Data Types:
URBPOP        float64
LIFEXP        float64
TUBINC        float64
DTHCMD        float64
CO2EMI        float64
GDPCAP        float64
EPISCO        float64
CANRAT       category
HDICAT_VH        bool
dtype: object
##################################
# Gathering the pairplot for all variables
##################################
cancer_rate_predictor_pair_plot = sns.pairplot(cancer_rate_premodelling,
kind='reg',
markers=["o", "s"],
plot_kws={'scatter_kws': {'alpha': 0.3}},
hue='CANRAT');
sns.move_legend(cancer_rate_predictor_pair_plot,
"lower center",
bbox_to_anchor=(.5, 1), ncol=2, title='CANRAT', frameon=False)
plt.show()
##################################
# Separating the target
# and predictor columns
##################################
X = cancer_rate_premodelling.drop('CANRAT', axis = 1)
y = cancer_rate_premodelling['CANRAT'].cat.codes
##################################
# Formulating the train and test data
# using a 70-30 ratio
##################################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 88888888, stratify=y)
##################################
# Performing a general exploration of the train dataset
##################################
print('Dataset Dimensions: ')
display(X_train.shape)
Dataset Dimensions:
(114, 8)
##################################
# Validating the class distribution of the train dataset
##################################
y_train.value_counts(normalize = True)
0    0.745614
1    0.254386
Name: proportion, dtype: float64
##################################
# Performing a general exploration of the test dataset
##################################
print('Dataset Dimensions: ')
display(X_test.shape)
Dataset Dimensions:
(49, 8)
##################################
# Validating the class distribution of the test dataset
##################################
y_test.value_counts(normalize = True)
0    0.755102
1    0.244898
Name: proportion, dtype: float64
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
metric_name = ['Accuracy','Precision','Recall','F1','AUROC']
metric_value = [accuracy_score(y_true, y_pred),
precision_score(y_true, y_pred),
recall_score(y_true, y_pred),
f1_score(y_true, y_pred),
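# note: roc_auc_score receives hard class labels here rather than
# predicted probabilities, so for binary predictions the reported
# AUROC is equivalent to balanced accuracy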
roc_auc_score(y_true, y_pred)]
metric_summary = pd.DataFrame(zip(metric_name, metric_value),
columns=['metric_name','metric_value'])
return(metric_summary)
1.3.6.2 Logistic Regression ¶
- The logistic regression model from the sklearn.linear_model Python library API was implemented.
- The model contains 5 hyperparameters:
- C = inverse of regularization strength held constant at a value of 1
- penalty = penalty norm made to vary between L1 and L2
- solver = algorithm used in the optimization problem made to vary between Saga and Liblinear
- class_weight = weights associated with classes held constant at a value of 25-75 between classes 0 and 1 (see the weighted log-loss sketch after this list)
- max_iter = maximum number of iterations taken for the solvers to converge held constant at a value of 500
- The original data reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories was used for model training and testing.
- Hyperparameter tuning was conducted using the 5-fold cross-validation method with optimal model performance using the F1 score determined for:
- C = 1
- penalty = L2 norm
- solver = Liblinear
- class_weight = 25-75 between classes 0 and 1
- max_iter = 500
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.8947
- Precision = 0.7073
- Recall = 1.0000
- F1 Score = 0.8285
- AUROC = 0.9294
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.9387
- Precision = 0.8461
- Recall = 0.9167
- F1 Score = 0.8800
- AUROC = 0.9313
- The independent test model performance was comparable to, or better than, the apparent model performance across most metrics, indicating no evidence of excessive model overfitting.
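For intuition on how the 25-75 weighting enters model training, the minimal sketch below scales each observation's contribution to the logistic log-loss with a per-sample weight derived from its class; the labels and predicted probabilities are hypothetical and serve only to illustrate the mechanism, not the library internals.
##################################
# Illustrating class-weighted log-loss
# (minimal sketch with hypothetical
# labels and predicted probabilities)
##################################
import numpy as np
from sklearn.metrics import log_loss

y_true = np.array([0, 0, 0, 1])                # hypothetical labels mirroring the 3:1 imbalance
y_prob = np.array([0.2, 0.1, 0.3, 0.6])        # hypothetical predicted probabilities for class 1
class_weights = {0: 0.25, 1: 0.75}
sample_weights = np.array([class_weights[label] for label in y_true])
print('Unweighted log-loss: ', log_loss(y_true, y_prob))
print('Weighted log-loss: ', log_loss(y_true, y_prob, sample_weight=sample_weights))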
##################################
# Creating an instance of the
# Logistic Regression model
##################################
logistic_regression = LogisticRegression()
##################################
# Defining the hyperparameters for the
# Logistic Regression model
##################################
hyperparameter_grid = {
'C': [1.0],
'penalty': ['l1', 'l2'],
'solver': ['liblinear','saga'],
'class_weight': [{0:0.25, 1:0.75}],
'max_iter': [500],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Logistic Regression model
##################################
weighted_logistic_regression = GridSearchCV(estimator = logistic_regression,
param_grid = hyperparameter_grid,
scoring='f1')
##################################
# Fitting the weighted Logistic Regression model
##################################
weighted_logistic_regression.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Logistic Regression model
##################################
weighted_logistic_regression.best_score_
weighted_logistic_regression.best_params_
{'C': 1.0, 'class_weight': {0: 0.25, 1: 0.75}, 'max_iter': 500, 'penalty': 'l2', 'random_state': 88888888, 'solver': 'liblinear'}
##################################
# Evaluating the weighted Logistic Regression model
# on the train set
##################################
weighted_logistic_regression_y_hat_train = weighted_logistic_regression.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
weighted_logistic_regression_performance_train = model_performance_evaluation(y_train, weighted_logistic_regression_y_hat_train)
weighted_logistic_regression_performance_train['model'] = ['weighted_logistic_regression'] * 5
weighted_logistic_regression_performance_train['set'] = ['train'] * 5
print('Weighted Logistic Regression Model Performance on Train Data: ')
display(weighted_logistic_regression_performance_train)
Weighted Logistic Regression Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.894737 | weighted_logistic_regression | train |
1 | Precision | 0.707317 | weighted_logistic_regression | train |
2 | Recall | 1.000000 | weighted_logistic_regression | train |
3 | F1 | 0.828571 | weighted_logistic_regression | train |
4 | AUROC | 0.929412 | weighted_logistic_regression | train |
##################################
# Evaluating the weighted Logistic Regression model
# on the test set
##################################
weighted_logistic_regression_y_hat_test = weighted_logistic_regression.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
weighted_logistic_regression_performance_test = model_performance_evaluation(y_test, weighted_logistic_regression_y_hat_test)
weighted_logistic_regression_performance_test['model'] = ['weighted_logistic_regression'] * 5
weighted_logistic_regression_performance_test['set'] = ['test'] * 5
print('Weighted Logistic Regression Model Performance on Test Data: ')
display(weighted_logistic_regression_performance_test)
Weighted Logistic Regression Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.938776 | weighted_logistic_regression | test |
1 | Precision | 0.846154 | weighted_logistic_regression | test |
2 | Recall | 0.916667 | weighted_logistic_regression | test |
3 | F1 | 0.880000 | weighted_logistic_regression | test |
4 | AUROC | 0.931306 | weighted_logistic_regression | test |
1.3.6.3 Decision Trees ¶
- The decision tree model from the sklearn.tree Python library API was implemented.
- The model contains 4 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- class_weight = weights associated with classes held constant at a value of 25-75 between classes 0 and 1
- The original data, reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- criterion = Gini
- max_depth = 3
- min_samples_leaf = 3
- class_weight = 25-75 between classes 0 and 1
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9561
- Precision = 0.8529
- Recall = 1.0000
- F1 Score = 0.9206
- AUROC = 0.9705
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.7692
- Recall = 0.8333
- F1 Score = 0.8000
- AUROC = 0.8761
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Decision Tree model
##################################
decision_tree = DecisionTreeClassifier()
##################################
# Defining the hyperparameters for the
# Decision Tree model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'class_weight': [{0:0.25, 1:0.75}],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Decision Tree model
##################################
weighted_decision_tree = GridSearchCV(estimator = decision_tree,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the weighted Decision Tree model
##################################
weighted_decision_tree.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Decision Tree model
##################################
weighted_decision_tree.best_score_
weighted_decision_tree.best_params_
{'class_weight': {0: 0.25, 1: 0.75}, 'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 3, 'random_state': 88888888}
##################################
# Evaluating the weighted decision tree model
# on the train set
##################################
weighted_decision_tree_y_hat_train = weighted_decision_tree.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
weighted_decision_tree_performance_train = model_performance_evaluation(y_train, weighted_decision_tree_y_hat_train)
weighted_decision_tree_performance_train['model'] = ['weighted_decision_tree'] * 5
weighted_decision_tree_performance_train['set'] = ['train'] * 5
print('Weighted Decision Tree Model Performance on Train Data: ')
display(weighted_decision_tree_performance_train)
Weighted Decision Tree Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.956140 | weighted_decision_tree | train |
1 | Precision | 0.852941 | weighted_decision_tree | train |
2 | Recall | 1.000000 | weighted_decision_tree | train |
3 | F1 | 0.920635 | weighted_decision_tree | train |
4 | AUROC | 0.970588 | weighted_decision_tree | train |
##################################
# Evaluating the weighted decision tree model
# on the test set
##################################
weighted_decision_tree_y_hat_test = weighted_decision_tree.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
weighted_decision_tree_performance_test = model_performance_evaluation(y_test, weighted_decision_tree_y_hat_test)
weighted_decision_tree_performance_test['model'] = ['weighted_decision_tree'] * 5
weighted_decision_tree_performance_test['set'] = ['test'] * 5
print('Weighted Decision Tree Model Performance on Test Data: ')
display(weighted_decision_tree_performance_test)
Weighted Decision Tree Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.897959 | weighted_decision_tree | test |
1 | Precision | 0.769231 | weighted_decision_tree | test |
2 | Recall | 0.833333 | weighted_decision_tree | test |
3 | F1 | 0.800000 | weighted_decision_tree | test |
4 | AUROC | 0.876126 | weighted_decision_tree | test |
1.3.6.4 Random Forest ¶
- The random forest model from the sklearn.ensemble Python library API was implemented.
- The model contains 6 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- n_estimators = number of trees in the forest made to vary between 100, 150 and 200
- max_features = number of features to consider when looking for the best split made to vary between Sqrt and Log2 of the feature count
- class_weight = weights associated with classes held constant at a value of 25-75 between classes 0 and 1
- The original data, reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- criterion = Gini
- max_depth = 5
- min_samples_leaf = 3
- n_estimators = 100
- max_features = Sqrt (square root of the feature count)
- class_weight = 25-75 between classes 0 and 1
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9736
- Precision = 0.9062
- Recall = 1.0000
- F1 Score = 0.9508
- AUROC = 0.9823
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8888
- Recall = 0.6666
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Random Forest model
##################################
random_forest = RandomForestClassifier()
##################################
# Defining the hyperparameters for the
# Random Forest model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'n_estimators': [100,150,200],
'max_features':['sqrt', 'log2'],
'class_weight': [{0:0.25, 1:0.75}],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Random Forest model
##################################
weighted_random_forest = GridSearchCV(estimator = random_forest,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the weighted Random Forest model
##################################
weighted_random_forest.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Random Forest model
##################################
weighted_random_forest.best_score_
weighted_random_forest.best_params_
{'class_weight': {0: 0.25, 1: 0.75}, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 100, 'random_state': 88888888}
##################################
# Evaluating the weighted Random Forest model
# on the train set
##################################
weighted_random_forest_y_hat_train = weighted_random_forest.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
weighted_random_forest_performance_train = model_performance_evaluation(y_train, weighted_random_forest_y_hat_train)
weighted_random_forest_performance_train['model'] = ['weighted_random_forest'] * 5
weighted_random_forest_performance_train['set'] = ['train'] * 5
print('Weighted Random Forest Model Performance on Train Data: ')
display(weighted_random_forest_performance_train)
Weighted Random Forest Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.973684 | weighted_random_forest | train |
1 | Precision | 0.906250 | weighted_random_forest | train |
2 | Recall | 1.000000 | weighted_random_forest | train |
3 | F1 | 0.950820 | weighted_random_forest | train |
4 | AUROC | 0.982353 | weighted_random_forest | train |
##################################
# Evaluating the weighted Random Forest model
# on the test set
##################################
weighted_random_forest_y_hat_test = weighted_random_forest.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
weighted_random_forest_performance_test = model_performance_evaluation(y_test, weighted_random_forest_y_hat_test)
weighted_random_forest_performance_test['model'] = ['weighted_random_forest'] * 5
weighted_random_forest_performance_test['set'] = ['test'] * 5
print('Weighted Random Forest Model Performance on Test Data: ')
display(weighted_random_forest_performance_test)
Weighted Random Forest Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.897959 | weighted_random_forest | test |
1 | Precision | 0.888889 | weighted_random_forest | test |
2 | Recall | 0.666667 | weighted_random_forest | test |
3 | F1 | 0.761905 | weighted_random_forest | test |
4 | AUROC | 0.819820 | weighted_random_forest | test |
1.3.6.5 Support Vector Machine ¶
- The support vector machine model from the sklearn.svm Python library API was implemented.
- The model contains 3 hyperparameters:
- C = regularization parameter (inversely proportional to regularization strength) held constant at a value of 1
- kernel = kernel type to be used in the algorithm made to vary between Linear, Poly, RBF and Sigmoid
- class_weight = weights associated with classes held constant at a value of 25-75 between classes 0 and 1
- The original data, reflecting a 3:1 class imbalance between the LOW and HIGH CANRAT categories, were used for model training and testing.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- C = 1
- kernel = Poly
- class_weight = 25-75 between classes 0 and 1
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9649
- Precision = 0.9629
- Recall = 0.8965
- F1 Score = 0.9285
- AUROC = 0.9423
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8775
- Precision = 0.8750
- Recall = 0.5833
- F1 Score = 0.7000
- AUROC = 0.7781
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Support Vector Machine model
##################################
support_vector_machine = SVC()
##################################
# Defining the hyperparameters for the
# Support Vector Machine model
##################################
hyperparameter_grid = {
'C': [1.0],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'class_weight': [{0:0.25, 1:0.75}],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Support Vector Machine model
##################################
weighted_support_vector_machine = GridSearchCV(estimator = support_vector_machine,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the weighted Support Vector Machine model
##################################
weighted_support_vector_machine.fit(X_train, y_train)
##################################
# Determining the optimal hyperparameter
# for the Support Vector Machine model
##################################
weighted_support_vector_machine.best_score_
weighted_support_vector_machine.best_params_
{'C': 1.0, 'class_weight': {0: 0.25, 1: 0.75}, 'kernel': 'poly', 'random_state': 88888888}
##################################
# Evaluating the weighted Support Vector Machine model
# on the train set
##################################
weighted_support_vector_machine_y_hat_train = weighted_support_vector_machine.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
weighted_support_vector_machine_performance_train = model_performance_evaluation(y_train, weighted_support_vector_machine_y_hat_train)
weighted_support_vector_machine_performance_train['model'] = ['weighted_support_vector_machine'] * 5
weighted_support_vector_machine_performance_train['set'] = ['train'] * 5
print('Weighted Support Vector Machine Model Performance on Train Data: ')
display(weighted_support_vector_machine_performance_train)
Weighted Support Vector Machine Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.964912 | weighted_support_vector_machine | train |
1 | Precision | 0.962963 | weighted_support_vector_machine | train |
2 | Recall | 0.896552 | weighted_support_vector_machine | train |
3 | F1 | 0.928571 | weighted_support_vector_machine | train |
4 | AUROC | 0.942394 | weighted_support_vector_machine | train |
##################################
# Evaluating the weighted Support Vector Machine model
# on the test set
##################################
weighted_support_vector_machine_y_hat_test = weighted_support_vector_machine.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
weighted_support_vector_machine_performance_test = model_performance_evaluation(y_test, weighted_support_vector_machine_y_hat_test)
weighted_support_vector_machine_performance_test['model'] = ['weighted_support_vector_machine'] * 5
weighted_support_vector_machine_performance_test['set'] = ['test'] * 5
print('Weighted Support Vector Machine Model Performance on Test Data: ')
display(weighted_support_vector_machine_performance_test)
Weighted Support Vector Machine Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.877551 | weighted_support_vector_machine | test |
1 | Precision | 0.875000 | weighted_support_vector_machine | test |
2 | Recall | 0.583333 | weighted_support_vector_machine | test |
3 | F1 | 0.700000 | weighted_support_vector_machine | test |
4 | AUROC | 0.778153 | weighted_support_vector_machine | test |
1.3.7 Model Development With SMOTE Upsampling ¶
1.3.7.1 Premodelling Data Description ¶
- Among the 9 numeric variables determined to have a statistically significant difference in means between the LOW and HIGH groups of the CANRAT target variable, only the 7 with absolute T-Test statistics greater than 5 were retained.
- GDPCAP: T.Test.Statistic=-11.937, T.Test.PValue=0.000
- EPISCO: T.Test.Statistic=-11.789, T.Test.PValue=0.000
- LIFEXP: T.Test.Statistic=-10.979, T.Test.PValue=0.000
- TUBINC: T.Test.Statistic=+9.609, T.Test.PValue=0.000
- DTHCMD: T.Test.Statistic=+8.376, T.Test.PValue=0.000
- CO2EMI: T.Test.Statistic=-7.031, T.Test.PValue=0.000
- URBPOP: T.Test.Statistic=-6.541, T.Test.PValue=0.000
- Among the 4 categorical predictors determined to have a statistically significant relationship with the LOW and HIGH groups of the CANRAT target variable, only the 1 with an absolute Chi-Square statistic greater than 15 was retained.
- HDICAT_VH: ChiSquare.Test.Statistic=76.764, ChiSquare.Test.PValue=0.000
- The SMOTE algorithm from the imblearn.over_sampling Python library API was implemented. The model training data were extended by upsampling the minority HIGH CANRAT category with SMOTE, as illustrated in the interpolation sketch after this list.
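Conceptually, SMOTE generates each synthetic minority sample by interpolating between a minority observation and one of its nearest minority-class neighbors. The minimal sketch below illustrates that interpolation step with hypothetical feature values; it is not the imblearn implementation itself.
##################################
# Illustrating the SMOTE interpolation step
# (minimal sketch with hypothetical values)
##################################
import numpy as np

rng = np.random.default_rng(88888888)
x_i = np.array([2.0, 5.0])             # a minority-class observation (hypothetical)
x_nn = np.array([3.0, 4.0])            # one of its nearest minority neighbors (hypothetical)
lam = rng.uniform(0.0, 1.0)            # random interpolation factor in [0, 1]
x_synthetic = x_i + lam * (x_nn - x_i)
print(x_synthetic)                     # new synthetic point on the segment between x_i and x_nn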
##################################
# Consolidating relevant numeric columns
# and encoded categorical columns
# after hypothesis testing
##################################
cancer_rate_premodelling = cancer_rate_preprocessed_all.drop(['AGRLND','POPDEN','GHGEMI','POPGRO','FORARE','HDICAT_H','HDICAT_M','HDICAT_L'], axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)
Dataset Dimensions:
(163, 9)
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)
Column Names and Data Types:
URBPOP        float64
LIFEXP        float64
TUBINC        float64
DTHCMD        float64
CO2EMI        float64
GDPCAP        float64
EPISCO        float64
CANRAT       category
HDICAT_VH        bool
dtype: object
##################################
# Gathering the pairplot for all variables
##################################
cancer_rate_predictor_pair_plot = sns.pairplot(cancer_rate_premodelling,
kind='reg',
markers=["o", "s"],
plot_kws={'scatter_kws': {'alpha': 0.3}},
hue='CANRAT');
sns.move_legend(cancer_rate_predictor_pair_plot,
"lower center",
bbox_to_anchor=(.5, 1), ncol=2, title='CANRAT', frameon=False)
plt.show()
##################################
# Separating the target
# and predictor columns
##################################
X = cancer_rate_premodelling.drop('CANRAT', axis = 1)
y = cancer_rate_premodelling['CANRAT'].cat.codes
##################################
# Formulating the train and test data
# using a 70-30 ratio
##################################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 88888888, stratify=y)
##################################
# Performing a general exploration of the train dataset
##################################
print('Dataset Dimensions: ')
display(X_train.shape)
Dataset Dimensions:
(114, 8)
##################################
# Validating the class distribution of the train dataset
##################################
y_train.value_counts(normalize = True)
0    0.745614
1    0.254386
Name: proportion, dtype: float64
##################################
# Initiating an oversampling instance
# on the train data using
# Synthetic Minority Oversampling Technique
##################################
smote = SMOTE(random_state = 88888888)
X_train_smote, y_train_smote = smote.fit_resample(X_train,y_train)
##################################
# Performing a general exploration of the oversampled train dataset
##################################
print('Dataset Dimensions: ')
display(X_train_smote.shape)
Dataset Dimensions:
(170, 8)
##################################
# Validating the class distribution of the oversampled train dataset
##################################
y_train_smote.value_counts(normalize = True)
0    0.5
1    0.5
Name: proportion, dtype: float64
##################################
# Performing a general exploration of the test dataset
##################################
print('Dataset Dimensions: ')
display(X_test.shape)
Dataset Dimensions:
(49, 8)
##################################
# Validating the class distribution of the test dataset
##################################
y_test.value_counts(normalize = True)
0    0.755102
1    0.244898
Name: proportion, dtype: float64
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
metric_name = ['Accuracy','Precision','Recall','F1','AUROC']
metric_value = [accuracy_score(y_true, y_pred),
precision_score(y_true, y_pred),
recall_score(y_true, y_pred),
f1_score(y_true, y_pred),
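# note: roc_auc_score receives hard class labels here rather than
# predicted probabilities, so for binary predictions the reported
# AUROC is equivalent to balanced accuracy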
roc_auc_score(y_true, y_pred)]
metric_summary = pd.DataFrame(zip(metric_name, metric_value),
columns=['metric_name','metric_value'])
return(metric_summary)
1.3.7.2 Logistic Regression ¶
- The logistic regression model from the sklearn.linear_model Python library API was implemented.
- The model contains 5 hyperparameters:
- C = inverse of regularization strength held constant at a value of 1
- penalty = penalty norm made to vary between L1 and L2
- solver = algorithm used in the optimization problem made to vary between Saga and Liblinear
- class_weight = weights associated with classes held constant at a value of None
- max_iter = maximum number of iterations taken for the solvers to converge held constant at a value of 500
- The model training data, extended by upsampling the minority HIGH CANRAT category with SMOTE, were used.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- C = 1
- penalty = L1 norm
- solver = Saga
- class_weight = None
- max_iter = 500
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9649
- Precision = 0.9032
- Recall = 0.9655
- F1 Score = 0.9333
- AUROC = 0.9651
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.9183
- Precision = 0.9000
- Recall = 0.7500
- F1 Score = 0.8181
- AUROC = 0.8614
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Logistic Regression model
##################################
logistic_regression = LogisticRegression()
##################################
# Defining the hyperparameters for the
# Logistic Regression model
##################################
hyperparameter_grid = {
'C': [1.0],
'penalty': ['l1', 'l2'],
'solver': ['liblinear','saga'],
'class_weight': [None],
'max_iter': [500],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Logistic Regression model
##################################
upsampled_logistic_regression = GridSearchCV(estimator = logistic_regression,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the upsampled Logistic Regression model
##################################
upsampled_logistic_regression.fit(X_train_smote, y_train_smote)
##################################
# Determining the optimal hyperparameter
# for the Logistic Regression model
##################################
upsampled_logistic_regression.best_score_
upsampled_logistic_regression.best_params_
{'C': 1.0, 'class_weight': None, 'max_iter': 500, 'penalty': 'l1', 'random_state': 88888888, 'solver': 'saga'}
##################################
# Evaluating the upsampled Logistic Regression model
# on the train set
##################################
upsampled_logistic_regression_y_hat_train = upsampled_logistic_regression.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_logistic_regression_performance_train = model_performance_evaluation(y_train, upsampled_logistic_regression_y_hat_train)
upsampled_logistic_regression_performance_train['model'] = ['upsampled_logistic_regression'] * 5
upsampled_logistic_regression_performance_train['set'] = ['train'] * 5
print('Upsampled Logistic Regression Model Performance on Train Data: ')
display(upsampled_logistic_regression_performance_train)
Upsampled Logistic Regression Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.964912 | upsampled_logistic_regression | train |
1 | Precision | 0.903226 | upsampled_logistic_regression | train |
2 | Recall | 0.965517 | upsampled_logistic_regression | train |
3 | F1 | 0.933333 | upsampled_logistic_regression | train |
4 | AUROC | 0.965112 | upsampled_logistic_regression | train |
##################################
# Evaluating the upsampled Logistic Regression model
# on the test set
##################################
upsampled_logistic_regression_y_hat_test = upsampled_logistic_regression.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_logistic_regression_performance_test = model_performance_evaluation(y_test, upsampled_logistic_regression_y_hat_test)
upsampled_logistic_regression_performance_test['model'] = ['upsampled_logistic_regression'] * 5
upsampled_logistic_regression_performance_test['set'] = ['test'] * 5
print('Upsampled Logistic Regression Model Performance on Test Data: ')
display(upsampled_logistic_regression_performance_test)
Upsampled Logistic Regression Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.918367 | upsampled_logistic_regression | test |
1 | Precision | 0.900000 | upsampled_logistic_regression | test |
2 | Recall | 0.750000 | upsampled_logistic_regression | test |
3 | F1 | 0.818182 | upsampled_logistic_regression | test |
4 | AUROC | 0.861486 | upsampled_logistic_regression | test |
1.3.7.3 Decision Trees ¶
- The decision tree model from the sklearn.tree Python library API was implemented.
- The model contains 4 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- class_weight = weights associated with classes held constant at a value of None
- The model training data, extended by upsampling the minority HIGH CANRAT category with SMOTE, were used.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- criterion = Entropy
- max_depth = 3
- min_samples_leaf = 5
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9210
- Precision = 0.7631
- Recall = 1.0000
- F1 Score = 0.8656
- AUROC = 0.9470
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.7692
- Recall = 0.8333
- F1 Score = 0.8000
- AUROC = 0.8761
- A considerable difference between the apparent and independent test model performance was observed, indicative of moderate model overfitting.
##################################
# Creating an instance of the
# Decision Tree model
##################################
decision_tree = DecisionTreeClassifier()
##################################
# Defining the hyperparameters for the
# Decision Tree model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Decision Tree model
##################################
upsampled_decision_tree = GridSearchCV(estimator = decision_tree,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the upsampled Decision Tree model
##################################
upsampled_decision_tree.fit(X_train_smote, y_train_smote)
##################################
# Determining the optimal hyperparameter
# for the Decision Tree model
##################################
upsampled_decision_tree.best_score_
upsampled_decision_tree.best_params_
{'class_weight': None, 'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5, 'random_state': 88888888}
##################################
# Evaluating the upsampled Decision Tree model
# on the train set
##################################
upsampled_decision_tree_y_hat_train = upsampled_decision_tree.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_decision_tree_performance_train = model_performance_evaluation(y_train, upsampled_decision_tree_y_hat_train)
upsampled_decision_tree_performance_train['model'] = ['upsampled_decision_tree'] * 5
upsampled_decision_tree_performance_train['set'] = ['train'] * 5
print('Upsampled Decision Tree Model Performance on Train Data: ')
display(upsampled_decision_tree_performance_train)
Upsampled Decision Tree Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.921053 | upsampled_decision_tree | train |
1 | Precision | 0.763158 | upsampled_decision_tree | train |
2 | Recall | 1.000000 | upsampled_decision_tree | train |
3 | F1 | 0.865672 | upsampled_decision_tree | train |
4 | AUROC | 0.947059 | upsampled_decision_tree | train |
##################################
# Evaluating the upsampled Decision Tree model
# on the test set
##################################
upsampled_decision_tree_y_hat_test = upsampled_decision_tree.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_decision_tree_performance_test = model_performance_evaluation(y_test, upsampled_decision_tree_y_hat_test)
upsampled_decision_tree_performance_test['model'] = ['upsampled_decision_tree'] * 5
upsampled_decision_tree_performance_test['set'] = ['test'] * 5
print('Upsampled Decision Tree Model Performance on Test Data: ')
display(upsampled_decision_tree_performance_test)
Upsampled Decision Tree Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.897959 | upsampled_decision_tree | test |
1 | Precision | 0.769231 | upsampled_decision_tree | test |
2 | Recall | 0.833333 | upsampled_decision_tree | test |
3 | F1 | 0.800000 | upsampled_decision_tree | test |
4 | AUROC | 0.876126 | upsampled_decision_tree | test |
1.3.7.4 Random Forest ¶
- The random forest model from the sklearn.ensemble Python library API was implemented.
- The model contains 6 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- n_estimators = number of trees in the forest made to vary between 100, 150 and 200
- max_features = number of features to consider when looking for the best split made to vary between Sqrt and Log2 of the feature count
- class_weight = weights associated with classes held constant at a value of None
- The model training data, extended by upsampling the minority HIGH CANRAT category with SMOTE, were used.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- criterion = Gini
- max_depth = 3
- min_samples_leaf = 3
- n_estimators = 150
- max_features = Sqrt (square root of the feature count)
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9736
- Precision = 0.9062
- Recall = 1.0000
- F1 Score = 0.9508
- AUROC = 0.9823
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8888
- Recall = 0.6666
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Random Forest model
##################################
random_forest = RandomForestClassifier()
##################################
# Defining the hyperparameters for the
# Random Forest model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'n_estimators': [100,150,200],
'max_features':['sqrt', 'log2'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the grid search strategy
# for the Random Forest model
##################################
upsampled_random_forest = GridSearchCV(estimator = random_forest,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the upsampled Random Forest model
##################################
upsampled_random_forest.fit(X_train_smote, y_train_smote)
##################################
# Determining the optimal hyperparameter
# for the Random Forest model
##################################
upsampled_random_forest.best_score_
upsampled_random_forest.best_params_
{'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 150, 'random_state': 88888888}
##################################
# Evaluating the upsampled Random Forest model
# on the train set
##################################
upsampled_random_forest_y_hat_train = upsampled_random_forest.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_random_forest_performance_train = model_performance_evaluation(y_train, upsampled_random_forest_y_hat_train)
upsampled_random_forest_performance_train['model'] = ['upsampled_random_forest'] * 5
upsampled_random_forest_performance_train['set'] = ['train'] * 5
print('Upsampled Random Forest Model Performance on Train Data: ')
display(upsampled_random_forest_performance_train)
Upsampled Random Forest Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.973684 | upsampled_random_forest | train |
1 | Precision | 0.906250 | upsampled_random_forest | train |
2 | Recall | 1.000000 | upsampled_random_forest | train |
3 | F1 | 0.950820 | upsampled_random_forest | train |
4 | AUROC | 0.982353 | upsampled_random_forest | train |
##################################
# Evaluating the upsampled Random Forest model
# on the test set
##################################
upsampled_random_forest_y_hat_test = upsampled_random_forest.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_random_forest_performance_test = model_performance_evaluation(y_test, upsampled_random_forest_y_hat_test)
upsampled_random_forest_performance_test['model'] = ['upsampled_random_forest'] * 5
upsampled_random_forest_performance_test['set'] = ['test'] * 5
print('Upsampled Random Forest Model Performance on Test Data: ')
display(upsampled_random_forest_performance_test)
Upsampled Random Forest Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.897959 | upsampled_random_forest | test |
1 | Precision | 0.888889 | upsampled_random_forest | test |
2 | Recall | 0.666667 | upsampled_random_forest | test |
3 | F1 | 0.761905 | upsampled_random_forest | test |
4 | AUROC | 0.819820 | upsampled_random_forest | test |
1.3.7.5 Support Vector Machine ¶
- The support vector machine model from the sklearn.svm Python library API was implemented.
- The model contains 3 hyperparameters:
- C = regularization parameter (inversely proportional to regularization strength) held constant at a value of 1
- kernel = kernel type to be used in the algorithm made to vary between Linear, Poly, RBF and Sigmoid
- class_weight = weights associated with classes held constant at a value of None
- The model training data, extended by upsampling the minority HIGH CANRAT category with SMOTE, were used.
- Hyperparameter tuning was conducted using 5-fold cross-validation, with the best F1 score obtained for:
- C = 1
- kernel = Linear
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9736
- Precision = 0.9333
- Recall = 0.9655
- F1 Score = 0.9491
- AUROC = 0.9709
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8181
- Recall = 0.7500
- F1 Score = 0.7826
- AUROC = 0.8479
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Support Vector Machine model
##################################
support_vector_machine = SVC()
##################################
# Defining the hyperparameters for the
# Support Vector Machine model
##################################
hyperparameter_grid = {
'C': [1.0],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Defining the hyperparameters for the
# Support Vector Machine model
##################################
upsampled_support_vector_machine = GridSearchCV(estimator = support_vector_machine,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the upsampled Support Vector Machine model
##################################
upsampled_support_vector_machine.fit(X_train_smote, y_train_smote)
##################################
# Determining the optimal hyperparameter
# for the Support Vector Machine model
##################################
upsampled_support_vector_machine.best_score_
upsampled_support_vector_machine.best_params_
{'C': 1.0, 'class_weight': None, 'kernel': 'linear', 'random_state': 88888888}
##################################
# Evaluating the upsampled Support Vector Machine model
# on the train set
##################################
upsampled_support_vector_machine_y_hat_train = upsampled_support_vector_machine.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_support_vector_machine_performance_train = model_performance_evaluation(y_train, upsampled_support_vector_machine_y_hat_train)
upsampled_support_vector_machine_performance_train['model'] = ['upsampled_support_vector_machine'] * 5
upsampled_support_vector_machine_performance_train['set'] = ['train'] * 5
print('Upsampled Support Vector Machine Model Performance on Train Data: ')
display(upsampled_support_vector_machine_performance_train)
Upsampled Support Vector Machine Model Performance on Train Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.973684 | upsampled_support_vector_machine | train |
1 | Precision | 0.933333 | upsampled_support_vector_machine | train |
2 | Recall | 0.965517 | upsampled_support_vector_machine | train |
3 | F1 | 0.949153 | upsampled_support_vector_machine | train |
4 | AUROC | 0.970994 | upsampled_support_vector_machine | train |
##################################
# Evaluating the upsampled Support Vector Machine model
# on the test set
##################################
upsampled_support_vector_machine_y_hat_test = upsampled_support_vector_machine.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
upsampled_support_vector_machine_performance_test = model_performance_evaluation(y_test, upsampled_support_vector_machine_y_hat_test)
upsampled_support_vector_machine_performance_test['model'] = ['upsampled_support_vector_machine'] * 5
upsampled_support_vector_machine_performance_test['set'] = ['test'] * 5
print('Upsampled Support Vector Machine Model Performance on Test Data: ')
display(upsampled_support_vector_machine_performance_test)
Upsampled Support Vector Machine Model Performance on Test Data:
index | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.897959 | upsampled_support_vector_machine | test |
1 | Precision | 0.818182 | upsampled_support_vector_machine | test |
2 | Recall | 0.750000 | upsampled_support_vector_machine | test |
3 | F1 | 0.782609 | upsampled_support_vector_machine | test |
4 | AUROC | 0.847973 | upsampled_support_vector_machine | test |
1.3.8 Model Development With CNN Downsampling ¶
1.3.8.1 Premodelling Data Description ¶
- Among the 9 numeric variables determined to have a statistically significant difference in means between the LOW and HIGH groups of the CANRAT target variable, only the 7 with absolute T-Test statistics greater than 5 were retained.
- GDPCAP: T.Test.Statistic=-11.937, T.Test.PValue=0.000
- EPISCO: T.Test.Statistic=-11.789, T.Test.PValue=0.000
- LIFEXP: T.Test.Statistic=-10.979, T.Test.PValue=0.000
- TUBINC: T.Test.Statistic=+9.609, T.Test.PValue=0.000
- DTHCMD: T.Test.Statistic=+8.376, T.Test.PValue=0.000
- CO2EMI: T.Test.Statistic=-7.031, T.Test.PValue=0.000
- URBPOP: T.Test.Statistic=-6.541, T.Test.PValue=0.000
- Among the 4 categorical predictors determined to have a statistically significant association between the categories of the categorical predictors and the LOW and HIGH groups of the CANRAT target variable, only 1 was retained with an absolute Chi-Square statistic greater than 15.
- HDICAT_VH: ChiSquare.Test.Statistic=76.764, ChiSquare.Test.PValue=0.000
- The Condensed Nearest Neighbour (CNN) algorithm from the imblearn.under_sampling Python library API was implemented. The model training data was reduced by downsampling the majority LOW CANRAT category using CNN, which retains all minority observations and keeps only the subset of majority observations needed for a nearest-neighbour rule to reproduce the training labels. (An illustrative sketch of the predictor screening step described above follows this list.)
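As an illustration only, the following is a minimal sketch of the numeric predictor screening step, assuming per-variable two-sample T-Tests on the preprocessed data and assuming the CANRAT categories are labeled 'LOW' and 'HIGH'; the authoritative computations are those documented in the earlier hypothesis testing section.
##################################
# Illustrative sketch of the numeric
# predictor screening step
# (assumptions: two-sample T-Tests;
# CANRAT labels 'LOW' and 'HIGH')
##################################
from scipy.stats import ttest_ind
numeric_predictors = ['GDPCAP','EPISCO','LIFEXP','TUBINC','DTHCMD','CO2EMI','URBPOP']
low_mask = cancer_rate_preprocessed_all['CANRAT'] == 'LOW'
for predictor in numeric_predictors:
    t_statistic, p_value = ttest_ind(cancer_rate_preprocessed_all.loc[low_mask, predictor],
                                     cancer_rate_preprocessed_all.loc[~low_mask, predictor])
    print(f'{predictor}: T.Test.Statistic={t_statistic:+.3f}, T.Test.PValue={p_value:.3f}')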
##################################
# Consolidating relevant numeric columns
# and encoded categorical columns
# after hypothesis testing
##################################
cancer_rate_premodelling = cancer_rate_preprocessed_all.drop(['AGRLND','POPDEN','GHGEMI','POPGRO','FORARE','HDICAT_H','HDICAT_M','HDICAT_L'], axis=1)
##################################
# Performing a general exploration of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)
Dataset Dimensions:
(163, 9)
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)
Column Names and Data Types:
URBPOP        float64
LIFEXP        float64
TUBINC        float64
DTHCMD        float64
CO2EMI        float64
GDPCAP        float64
EPISCO        float64
CANRAT       category
HDICAT_VH        bool
dtype: object
##################################
# Gathering the pairplot for all variables
##################################
cancer_rate_predictor_pair_plot = sns.pairplot(cancer_rate_premodelling,
kind='reg',
markers=["o", "s"],
plot_kws={'scatter_kws': {'alpha': 0.3}},
hue='CANRAT');
sns.move_legend(cancer_rate_predictor_pair_plot,
"lower center",
bbox_to_anchor=(.5, 1), ncol=2, title='CANRAT', frameon=False)
plt.show()
##################################
# Separating the target
# and predictor columns
##################################
X = cancer_rate_premodelling.drop('CANRAT', axis = 1)
y = cancer_rate_premodelling['CANRAT'].cat.codes
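# Note: with .cat.codes encoding, the majority
# LOW CANRAT category maps to 0 and the minority
# HIGH category maps to 1, consistent with the
# class distributions shown below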
##################################
# Formulating the train and test data
# using a 70-30 ratio
##################################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state= 88888888, stratify=y)
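# Note: stratify=y preserves the original LOW/HIGH
# class proportions in both the train and test splits,
# as verified in the class distribution checks below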
##################################
# Performing a general exploration of the train dataset
##################################
print('Dataset Dimensions: ')
display(X_train.shape)
Dataset Dimensions:
(114, 8)
##################################
# Validating the class distribution of the train dataset
##################################
y_train.value_counts(normalize = True)
0    0.745614
1    0.254386
Name: proportion, dtype: float64
##################################
# Initiating an undersampling instance
# on the train data using
# Condensed Nearest Neighbour
##################################
cnn = CondensedNearestNeighbour(random_state = 88888888, n_neighbors=3)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train,y_train)
##################################
# Performing a general exploration of the downsampled train dataset
##################################
print('Dataset Dimensions: ')
display(X_train_cnn.shape)
Dataset Dimensions:
(50, 8)
##################################
# Validating the class distribution of the downsampled train dataset
##################################
y_train_cnn.value_counts(normalize = True)
1    0.58
0    0.42
Name: proportion, dtype: float64
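Because CNN performs prototype selection rather than synthesizing new observations, the retained cases can be traced back to the original train data. The following is a minimal sketch using the fitted cnn instance above and the sample_indices_ attribute exposed by imblearn's prototype-selection samplers.
##################################
# Tracing the observations retained
# by CNN back to the original train
# data (illustrative sketch only)
##################################
retained_indices = cnn.sample_indices_
print('Number of Retained Observations: ')
display(len(retained_indices))
print('Retained Class Distribution: ')
display(y_train.iloc[retained_indices].value_counts(normalize = True))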
##################################
# Performing a general exploration of the test dataset
##################################
print('Dataset Dimensions: ')
display(X_test.shape)
Dataset Dimensions:
(49, 8)
##################################
# Validating the class distribution of the test dataset
##################################
y_test.value_counts(normalize = True)
0    0.755102
1    0.244898
Name: proportion, dtype: float64
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
    metric_name = ['Accuracy','Precision','Recall','F1','AUROC']
    metric_value = [accuracy_score(y_true, y_pred),
                    precision_score(y_true, y_pred),
                    recall_score(y_true, y_pred),
                    f1_score(y_true, y_pred),
                    roc_auc_score(y_true, y_pred)]
    metric_summary = pd.DataFrame(zip(metric_name, metric_value),
                                  columns=['metric_name','metric_value'])
    return metric_summary
1.3.8.2 Logistic Regression ¶
- The logistic regression model from the sklearn.linear_model Python library API was implemented.
- The model contains 5 hyperparameters:
- C = inverse of regularization strength held constant at a value of 1
- penalty = penalty norm made to vary between L1 and L2
- solver = algorithm used in the optimization problem made to vary between Saga and Liblinear
- class_weight = weights associated with classes held constant at a value of None
- max_iter = maximum number of iterations taken for the solvers to converge held constant at a value of 500
- The model training data reduced by downsampling the majority LOW CANRAT category using CNN was used.
- Hyperparameter tuning was conducted using the 5-fold cross-validation method, with optimal model performance by F1 score determined for:
- C = 1
- penalty = L1 norm
- solver = Liblinear
- class_weight = None
- max_iter = 500
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9473
- Precision = 0.8484
- Recall = 0.9655
- F1 Score = 0.9032
- AUROC = 0.9533
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.9183
- Precision = 0.9000
- Recall = 0.7500
- F1 Score = 0.8181
- AUROC = 0.8614
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Logistic Regression model
##################################
logistic_regression = LogisticRegression()
##################################
# Defining the hyperparameters for the
# Logistic Regression model
##################################
hyperparameter_grid = {
'C': [1.0],
'penalty': ['l1', 'l2'],
'solver': ['liblinear','saga'],
'class_weight': [None],
'max_iter': [500],
'random_state': [88888888]}
##################################
# Setting up the GridSearchCV instance
# for the Logistic Regression model
##################################
downsampled_logistic_regression = GridSearchCV(estimator = logistic_regression,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the downsampled Logistic Regression model
##################################
downsampled_logistic_regression.fit(X_train_cnn, y_train_cnn)
##################################
# Determining the optimal hyperparameter
# for the Logistic Regression model
##################################
downsampled_logistic_regression.best_score_
downsampled_logistic_regression.best_params_
{'C': 1.0, 'class_weight': None, 'max_iter': 500, 'penalty': 'l1', 'random_state': 88888888, 'solver': 'liblinear'}
##################################
# Evaluating the downsampled Logistic Regression model
# on the train set
##################################
downsampled_logistic_regression_y_hat_train = downsampled_logistic_regression.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_logistic_regression_performance_train = model_performance_evaluation(y_train, downsampled_logistic_regression_y_hat_train)
downsampled_logistic_regression_performance_train['model'] = ['downsampled_logistic_regression'] * 5
downsampled_logistic_regression_performance_train['set'] = ['train'] * 5
print('Downsampled Logistic Regression Model Performance on Train Data: ')
display(downsampled_logistic_regression_performance_train)
Downsampled Logistic Regression Model Performance on Train Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.947368 | downsampled_logistic_regression | train |
1 | Precision | 0.848485 | downsampled_logistic_regression | train |
2 | Recall | 0.965517 | downsampled_logistic_regression | train |
3 | F1 | 0.903226 | downsampled_logistic_regression | train |
4 | AUROC | 0.953347 | downsampled_logistic_regression | train |
##################################
# Evaluating the downsampled Logistic Regression model
# on the test set
##################################
downsampled_logistic_regression_y_hat_test = downsampled_logistic_regression.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_logistic_regression_performance_test = model_performance_evaluation(y_test, downsampled_logistic_regression_y_hat_test)
downsampled_logistic_regression_performance_test['model'] = ['downsampled_logistic_regression'] * 5
downsampled_logistic_regression_performance_test['set'] = ['test'] * 5
print('Downsampled Logistic Regression Model Performance on Test Data: ')
display(downsampled_logistic_regression_performance_test)
Downsampled Logistic Regression Model Performance on Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.918367 | downsampled_logistic_regression | test |
1 | Precision | 0.900000 | downsampled_logistic_regression | test |
2 | Recall | 0.750000 | downsampled_logistic_regression | test |
3 | F1 | 0.818182 | downsampled_logistic_regression | test |
4 | AUROC | 0.861486 | downsampled_logistic_regression | test |
1.3.8.3 Decision Trees ¶
- The decision tree model from the sklearn.tree Python library API was implemented.
- The model contains 4 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- class_weight = weights associated with classes held constant at a value of None
- The model training data reduced by downsampling the majority LOW CANRAT category using CNN was used.
- Hyperparameter tuning was conducted using the 5-fold cross-validation method, with optimal model performance by F1 score determined for:
- criterion = Gini
- max_depth = 3
- min_samples_leaf = 5
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9385
- Precision = 0.9230
- Recall = 0.8275
- F1 Score = 0.8727
- AUROC = 0.9020
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8888
- Recall = 0.6666
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Decision Tree model
##################################
decision_tree = DecisionTreeClassifier()
##################################
# Defining the hyperparameters for the
# Decision Tree model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the GridSearchCV instance
# for the Decision Tree model
##################################
downsampled_decision_tree = GridSearchCV(estimator = decision_tree,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the downsampled Decision Tree model
##################################
downsampled_decision_tree.fit(X_train_cnn, y_train_cnn)
##################################
# Determining the optimal hyperparameter
# for the Decision Tree model
##################################
downsampled_decision_tree.best_score_
downsampled_decision_tree.best_params_
{'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 5, 'random_state': 88888888}
##################################
# Evaluating the downsampled Decision Tree model
# on the train set
##################################
downsampled_decision_tree_y_hat_train = downsampled_decision_tree.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_decision_tree_performance_train = model_performance_evaluation(y_train, downsampled_decision_tree_y_hat_train)
downsampled_decision_tree_performance_train['model'] = ['downsampled_decision_tree'] * 5
downsampled_decision_tree_performance_train['set'] = ['train'] * 5
print('Downsampled Decision Tree Model Performance on Train Data: ')
display(downsampled_decision_tree_performance_train)
Downsampled Decision Tree Model Performance on Train Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.938596 | downsampled_decision_tree | train |
1 | Precision | 0.923077 | downsampled_decision_tree | train |
2 | Recall | 0.827586 | downsampled_decision_tree | train |
3 | F1 | 0.872727 | downsampled_decision_tree | train |
4 | AUROC | 0.902028 | downsampled_decision_tree | train |
##################################
# Evaluating the downsampled Decision Tree model
# on the test set
##################################
downsampled_decision_tree_y_hat_test = downsampled_decision_tree.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_decision_tree_performance_test = model_performance_evaluation(y_test, downsampled_decision_tree_y_hat_test)
downsampled_decision_tree_performance_test['model'] = ['downsampled_decision_tree'] * 5
downsampled_decision_tree_performance_test['set'] = ['test'] * 5
print('Downsampled Decision Tree Model Performance on Test Data: ')
display(downsampled_decision_tree_performance_test)
Downsampled Decision Tree Model Performance on Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.897959 | downsampled_decision_tree | test |
1 | Precision | 0.888889 | downsampled_decision_tree | test |
2 | Recall | 0.666667 | downsampled_decision_tree | test |
3 | F1 | 0.761905 | downsampled_decision_tree | test |
4 | AUROC | 0.819820 | downsampled_decision_tree | test |
1.3.8.4 Random Forest ¶
- The random forest model from the sklearn.ensemble Python library API was implemented.
- The model contains 6 hyperparameters:
- criterion = function to measure the quality of a split made to vary between Gini, Entropy and Log-Loss
- max_depth = maximum depth of the tree made to vary between 3, 5 and 7
- min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 3, 5 and 10
- n_estimators = number of trees in the forest made to vary between 100, 150 and 200
- max_features = number of features to consider when looking for the best split made to vary between Sqrt and Log2 of the total number of features
- class_weight = weights associated with classes held constant at a value of None
- The model training data reduced by downsampling the majority LOW CANRAT category using CNN was used.
- Hyperparameter tuning was conducted using the 5-fold cross-validation method, with optimal model performance by F1 score determined for:
- criterion = Gini
- max_depth = 3
- min_samples_leaf = 3
- n_estimators = 100
- max_features = Sqrt of the total number of features
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9649
- Precision = 0.9032
- Recall = 0.9655
- F1 Score = 0.9333
- AUROC = 0.9651
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8888
- Recall = 0.6666
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Random Forest model
##################################
random_forest = RandomForestClassifier()
##################################
# Defining the hyperparameters for the
# Random Forest model
##################################
hyperparameter_grid = {
'criterion': ['gini','entropy','log_loss'],
'max_depth': [3,5,7],
'min_samples_leaf': [3,5,10],
'n_estimators': [100,150,200],
'max_features':['sqrt', 'log2'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the GridSearchCV instance
# for the Random Forest model
##################################
downsampled_random_forest = GridSearchCV(estimator = random_forest,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the downsampled Random Forest model
##################################
downsampled_random_forest.fit(X_train_cnn, y_train_cnn)
##################################
# Determining the optimal hyperparameter
# for the Random Forest model
##################################
downsampled_random_forest.best_score_
downsampled_random_forest.best_params_
{'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 100, 'random_state': 88888888}
##################################
# Evaluating the downsampled Random Forest model
# on the train set
##################################
downsampled_random_forest_y_hat_train = downsampled_random_forest.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_random_forest_performance_train = model_performance_evaluation(y_train, downsampled_random_forest_y_hat_train)
downsampled_random_forest_performance_train['model'] = ['downsampled_random_forest'] * 5
downsampled_random_forest_performance_train['set'] = ['train'] * 5
print('Downsampled Random Forest Model Performance on Train Data: ')
display(downsampled_random_forest_performance_train)
Downsampled Random Forest Model Performance on Train Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.964912 | downsampled_random_forest | train |
1 | Precision | 0.903226 | downsampled_random_forest | train |
2 | Recall | 0.965517 | downsampled_random_forest | train |
3 | F1 | 0.933333 | downsampled_random_forest | train |
4 | AUROC | 0.965112 | downsampled_random_forest | train |
##################################
# Evaluating the downsampled Random Forest model
# on the test set
##################################
downsampled_random_forest_y_hat_test = downsampled_random_forest.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_random_forest_performance_test = model_performance_evaluation(y_test, downsampled_random_forest_y_hat_test)
downsampled_random_forest_performance_test['model'] = ['downsampled_random_forest'] * 5
downsampled_random_forest_performance_test['set'] = ['test'] * 5
print('Downsampled Random Forest Model Performance on Test Data: ')
display(downsampled_random_forest_performance_test)
Downsampled Random Forest Model Performance on Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.897959 | downsampled_random_forest | test |
1 | Precision | 0.888889 | downsampled_random_forest | test |
2 | Recall | 0.666667 | downsampled_random_forest | test |
3 | F1 | 0.761905 | downsampled_random_forest | test |
4 | AUROC | 0.819820 | downsampled_random_forest | test |
1.3.8.5 Support Vector Machine ¶
- The support vector machine model from the sklearn.svm Python library API was implemented.
- The model contains 3 hyperparameters:
- C = inverse of regularization strength held constant at a value of 1
- kernel = kernel type to be used in the algorithm made to vary between Linear, Poly, RBF and Sigmoid
- class_weight = weights associated with classes held constant at a value of None
- The model training data reduced by downsampling the majority LOW CANRAT category using CNN was used.
- Hyperparameter tuning was conducted using the 5-fold cross-validation method, with optimal model performance by F1 score determined for:
- C = 1
- kernel = Linear
- class_weight = None
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9561
- Precision = 0.9285
- Recall = 0.8965
- F1 Score = 0.9122
- AUROC = 0.9365
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.8979
- Precision = 0.8888
- Recall = 0.6666
- F1 Score = 0.7619
- AUROC = 0.8198
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Creating an instance of the
# Support Vector Machine model
##################################
support_vector_machine = SVC()
##################################
# Defining the hyperparameters for the
# Support Vector Machine model
##################################
hyperparameter_grid = {
'C': [1.0],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'class_weight': [None],
'random_state': [88888888]}
##################################
# Setting up the GridSearchCV instance
# for the Support Vector Machine model
##################################
downsampled_support_vector_machine = GridSearchCV(estimator = support_vector_machine,
param_grid = hyperparameter_grid,
n_jobs = -1,
scoring='f1')
##################################
# Fitting the downsampled Support Vector Machine model
##################################
downsampled_support_vector_machine.fit(X_train_cnn, y_train_cnn)
##################################
# Determining the optimal hyperparameter
# for the Support Vector Machine model
##################################
downsampled_support_vector_machine.best_score_
downsampled_support_vector_machine.best_params_
{'C': 1.0, 'class_weight': None, 'kernel': 'linear', 'random_state': 88888888}
##################################
# Evaluating the downsampled Support Vector Machine model
# on the train set
##################################
downsampled_support_vector_machine_y_hat_train = downsampled_support_vector_machine.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_support_vector_machine_performance_train = model_performance_evaluation(y_train, downsampled_support_vector_machine_y_hat_train)
downsampled_support_vector_machine_performance_train['model'] = ['downsampled_support_vector_machine'] * 5
downsampled_support_vector_machine_performance_train['set'] = ['train'] * 5
print('Downsampled Support Vector Machine Model Performance on Train Data: ')
display(downsampled_support_vector_machine_performance_train)
Downsampled Support Vector Machine Model Performance on Train Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.956140 | downsampled_support_vector_machine | train |
1 | Precision | 0.928571 | downsampled_support_vector_machine | train |
2 | Recall | 0.896552 | downsampled_support_vector_machine | train |
3 | F1 | 0.912281 | downsampled_support_vector_machine | train |
4 | AUROC | 0.936511 | downsampled_support_vector_machine | train |
##################################
# Evaluating the downsampled Support Vector Machine model
# on the test set
##################################
downsampled_support_vector_machine_y_hat_test = downsampled_support_vector_machine.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
downsampled_support_vector_machine_performance_test = model_performance_evaluation(y_test, downsampled_support_vector_machine_y_hat_test)
downsampled_support_vector_machine_performance_test['model'] = ['downsampled_support_vector_machine'] * 5
downsampled_support_vector_machine_performance_test['set'] = ['test'] * 5
print('Downsampled Support Vector Machine Model Performance on Test Data: ')
display(downsampled_support_vector_machine_performance_test)
Downsampled Support Vector Machine Model Performance on Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.897959 | downsampled_support_vector_machine | test |
1 | Precision | 0.888889 | downsampled_support_vector_machine | test |
2 | Recall | 0.666667 | downsampled_support_vector_machine | test |
3 | F1 | 0.761905 | downsampled_support_vector_machine | test |
4 | AUROC | 0.819820 | downsampled_support_vector_machine | test |
1.3.9 Model Development With Stacking Ensemble Learning ¶
1.3.9.1 Premodelling Data Description ¶
- Among the formulated versions of the logistic regression model, the model which applied class weights demonstrated the best independent test model performance. Considerable difference in the apparent and independent test model performance was observed, indicative of the presence of moderate model overfitting.
- Accuracy = 0.9387
- Precision = 0.8461
- Recall = 0.9167
- F1 Score = 0.8800
- AUROC = 0.9313
- Among the formulated versions of the decision tree model, the model which applied upsampling of the minority class using SMOTE demonstrated the best independent test model performance. Considerable difference in the apparent and independent test model performance was observed, indicative of the presence of moderate model overfitting.
- Accuracy = 0.8979
- Precision = 0.7692
- Recall = 0.8333
- F1 Score = 0.8000
- AUROC = 0.8761
- Among the formulated versions of the random forest model, the model which applied upsampling of the minority class using SMOTE demonstrated the best independent test model performance. High difference in the apparent and independent test model performance was observed, indicative of the presence of excessive model overfitting.
- Accuracy = 0.9387
- Precision = 0.8461
- Recall = 0.9167
- F1 Score = 0.8800
- AUROC = 0.9313
- Among the formulated versions of the support vector machine model, the model which applied upsampling of the minority class using SMOTE demonstrated the best independent test model performance. High difference in the apparent and independent test model performance was observed, indicative of the presence of excessive model overfitting.
- Accuracy = 0.8979
- Precision = 0.8181
- Recall = 0.7500
- F1 Score = 0.7826
- AUROC = 0.8479
- All individual formulated models which applied upsampling of the minority class using SMOTE were used to generate the base-learners for the stacking algorithm.
##################################
# Consolidating all the
# Logistic Regression
# model performance measures
##################################
logistic_regression_performance_comparison = pd.concat([optimal_logistic_regression_performance_train,
optimal_logistic_regression_performance_test,
weighted_logistic_regression_performance_train,
weighted_logistic_regression_performance_test,
upsampled_logistic_regression_performance_train,
upsampled_logistic_regression_performance_test,
downsampled_logistic_regression_performance_train,
downsampled_logistic_regression_performance_test],
ignore_index=True)
print('Consolidated Logistic Regression Model Performance on Train and Test Data: ')
display(logistic_regression_performance_comparison)
Consolidated Logistic Regression Model Performance on Train and Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.947368 | optimal_logistic_regression | train |
1 | Precision | 0.870968 | optimal_logistic_regression | train |
2 | Recall | 0.931034 | optimal_logistic_regression | train |
3 | F1 | 0.900000 | optimal_logistic_regression | train |
4 | AUROC | 0.941988 | optimal_logistic_regression | train |
5 | Accuracy | 0.897959 | optimal_logistic_regression | test |
6 | Precision | 0.888889 | optimal_logistic_regression | test |
7 | Recall | 0.666667 | optimal_logistic_regression | test |
8 | F1 | 0.761905 | optimal_logistic_regression | test |
9 | AUROC | 0.819820 | optimal_logistic_regression | test |
10 | Accuracy | 0.894737 | weighted_logistic_regression | train |
11 | Precision | 0.707317 | weighted_logistic_regression | train |
12 | Recall | 1.000000 | weighted_logistic_regression | train |
13 | F1 | 0.828571 | weighted_logistic_regression | train |
14 | AUROC | 0.929412 | weighted_logistic_regression | train |
15 | Accuracy | 0.938776 | weighted_logistic_regression | test |
16 | Precision | 0.846154 | weighted_logistic_regression | test |
17 | Recall | 0.916667 | weighted_logistic_regression | test |
18 | F1 | 0.880000 | weighted_logistic_regression | test |
19 | AUROC | 0.931306 | weighted_logistic_regression | test |
20 | Accuracy | 0.964912 | upsampled_logistic_regression | train |
21 | Precision | 0.903226 | upsampled_logistic_regression | train |
22 | Recall | 0.965517 | upsampled_logistic_regression | train |
23 | F1 | 0.933333 | upsampled_logistic_regression | train |
24 | AUROC | 0.965112 | upsampled_logistic_regression | train |
25 | Accuracy | 0.918367 | upsampled_logistic_regression | test |
26 | Precision | 0.900000 | upsampled_logistic_regression | test |
27 | Recall | 0.750000 | upsampled_logistic_regression | test |
28 | F1 | 0.818182 | upsampled_logistic_regression | test |
29 | AUROC | 0.861486 | upsampled_logistic_regression | test |
30 | Accuracy | 0.947368 | downsampled_logistic_regression | train |
31 | Precision | 0.848485 | downsampled_logistic_regression | train |
32 | Recall | 0.965517 | downsampled_logistic_regression | train |
33 | F1 | 0.903226 | downsampled_logistic_regression | train |
34 | AUROC | 0.953347 | downsampled_logistic_regression | train |
35 | Accuracy | 0.918367 | downsampled_logistic_regression | test |
36 | Precision | 0.900000 | downsampled_logistic_regression | test |
37 | Recall | 0.750000 | downsampled_logistic_regression | test |
38 | F1 | 0.818182 | downsampled_logistic_regression | test |
39 | AUROC | 0.861486 | downsampled_logistic_regression | test |
##################################
# Consolidating all the F1 score
# model performance measures
##################################
logistic_regression_performance_comparison_F1 = logistic_regression_performance_comparison[logistic_regression_performance_comparison['metric_name']=='F1']
logistic_regression_performance_comparison_F1_train = logistic_regression_performance_comparison_F1[logistic_regression_performance_comparison_F1['set']=='train'].loc[:,"metric_value"]
logistic_regression_performance_comparison_F1_test = logistic_regression_performance_comparison_F1[logistic_regression_performance_comparison_F1['set']=='test'].loc[:,"metric_value"]
##################################
# Combining all the F1 score
# model performance measures
# between train and test sets
##################################
logistic_regression_performance_comparison_F1_plot = pd.DataFrame({'train': logistic_regression_performance_comparison_F1_train.values,
'test': logistic_regression_performance_comparison_F1_test.values},
index=logistic_regression_performance_comparison_F1['model'].unique())
logistic_regression_performance_comparison_F1_plot
train | test | |
---|---|---|
optimal_logistic_regression | 0.900000 | 0.761905 |
weighted_logistic_regression | 0.828571 | 0.880000 |
upsampled_logistic_regression | 0.933333 | 0.818182 |
downsampled_logistic_regression | 0.903226 | 0.818182 |
##################################
# Plotting all the F1 score
# model performance measures
# between train and test sets
##################################
logistic_regression_performance_comparison_F1_plot = logistic_regression_performance_comparison_F1_plot.plot.barh(figsize=(10, 6))
logistic_regression_performance_comparison_F1_plot.set_xlim(0.00,1.00)
logistic_regression_performance_comparison_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Test Data")
logistic_regression_performance_comparison_F1_plot.set_xlabel("F1 Score Performance")
logistic_regression_performance_comparison_F1_plot.set_ylabel("Logistic Regression Model")
logistic_regression_performance_comparison_F1_plot.grid(False)
logistic_regression_performance_comparison_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in logistic_regression_performance_comparison_F1_plot.containers:
    logistic_regression_performance_comparison_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Plotting the confusion matrices
# for all the Logistic Regression models
##################################
classifiers = {"optimal_logistic_regression": optimal_logistic_regression,
"weighted_logistic_regression": weighted_logistic_regression,
"upsampled_logistic_regression": upsampled_logistic_regression,
"downsampled_logistic_regression": downsampled_logistic_regression}
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix)
    disp.plot(ax=axes[i], xticks_rotation=0)
    disp.ax_.grid(False)
    disp.ax_.set_title(key)
    disp.im_.colorbar.remove()
fig.colorbar(disp.im_, ax=axes)
plt.show()
##################################
# Consolidating all the
# Decision Tree
# model performance measures
##################################
decision_tree_performance_comparison = pd.concat([optimal_decision_tree_performance_train,
optimal_decision_tree_performance_test,
weighted_decision_tree_performance_train,
weighted_decision_tree_performance_test,
upsampled_decision_tree_performance_train,
upsampled_decision_tree_performance_test,
downsampled_decision_tree_performance_train,
downsampled_decision_tree_performance_test],
ignore_index=True)
print('Consolidated Decision Tree Model Performance on Train and Test Data: ')
display(decision_tree_performance_comparison)
Consolidated Decision Tree Model Performance on Train and Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.973684 | optimal_decision_tree | train |
1 | Precision | 1.000000 | optimal_decision_tree | train |
2 | Recall | 0.896552 | optimal_decision_tree | train |
3 | F1 | 0.945455 | optimal_decision_tree | train |
4 | AUROC | 0.948276 | optimal_decision_tree | train |
5 | Accuracy | 0.857143 | optimal_decision_tree | test |
6 | Precision | 0.857143 | optimal_decision_tree | test |
7 | Recall | 0.500000 | optimal_decision_tree | test |
8 | F1 | 0.631579 | optimal_decision_tree | test |
9 | AUROC | 0.736486 | optimal_decision_tree | test |
10 | Accuracy | 0.956140 | weighted_decision_tree | train |
11 | Precision | 0.852941 | weighted_decision_tree | train |
12 | Recall | 1.000000 | weighted_decision_tree | train |
13 | F1 | 0.920635 | weighted_decision_tree | train |
14 | AUROC | 0.970588 | weighted_decision_tree | train |
15 | Accuracy | 0.897959 | weighted_decision_tree | test |
16 | Precision | 0.769231 | weighted_decision_tree | test |
17 | Recall | 0.833333 | weighted_decision_tree | test |
18 | F1 | 0.800000 | weighted_decision_tree | test |
19 | AUROC | 0.876126 | weighted_decision_tree | test |
20 | Accuracy | 0.921053 | upsampled_decision_tree | train |
21 | Precision | 0.763158 | upsampled_decision_tree | train |
22 | Recall | 1.000000 | upsampled_decision_tree | train |
23 | F1 | 0.865672 | upsampled_decision_tree | train |
24 | AUROC | 0.947059 | upsampled_decision_tree | train |
25 | Accuracy | 0.897959 | upsampled_decision_tree | test |
26 | Precision | 0.769231 | upsampled_decision_tree | test |
27 | Recall | 0.833333 | upsampled_decision_tree | test |
28 | F1 | 0.800000 | upsampled_decision_tree | test |
29 | AUROC | 0.876126 | upsampled_decision_tree | test |
30 | Accuracy | 0.938596 | downsampled_decision_tree | train |
31 | Precision | 0.923077 | downsampled_decision_tree | train |
32 | Recall | 0.827586 | downsampled_decision_tree | train |
33 | F1 | 0.872727 | downsampled_decision_tree | train |
34 | AUROC | 0.902028 | downsampled_decision_tree | train |
35 | Accuracy | 0.897959 | downsampled_decision_tree | test |
36 | Precision | 0.888889 | downsampled_decision_tree | test |
37 | Recall | 0.666667 | downsampled_decision_tree | test |
38 | F1 | 0.761905 | downsampled_decision_tree | test |
39 | AUROC | 0.819820 | downsampled_decision_tree | test |
##################################
# Consolidating all the F1 score
# model performance measures
##################################
decision_tree_performance_comparison_F1 = decision_tree_performance_comparison[decision_tree_performance_comparison['metric_name']=='F1']
decision_tree_performance_comparison_F1_train = decision_tree_performance_comparison_F1[decision_tree_performance_comparison_F1['set']=='train'].loc[:,"metric_value"]
decision_tree_performance_comparison_F1_test = decision_tree_performance_comparison_F1[decision_tree_performance_comparison_F1['set']=='test'].loc[:,"metric_value"]
##################################
# Combining all the F1 score
# model performance measures
# between train and test sets
##################################
decision_tree_performance_comparison_F1_plot = pd.DataFrame({'train': decision_tree_performance_comparison_F1_train.values,
'test': decision_tree_performance_comparison_F1_test.values},
index=decision_tree_performance_comparison_F1['model'].unique())
decision_tree_performance_comparison_F1_plot
train | test | |
---|---|---|
optimal_decision_tree | 0.945455 | 0.631579 |
weighted_decision_tree | 0.920635 | 0.800000 |
upsampled_decision_tree | 0.865672 | 0.800000 |
downsampled_decision_tree | 0.872727 | 0.761905 |
##################################
# Plotting all the F1 score
# model performance measures
# between train and test sets
##################################
decision_tree_performance_comparison_F1_plot = decision_tree_performance_comparison_F1_plot.plot.barh(figsize=(10, 6))
decision_tree_performance_comparison_F1_plot.set_xlim(0.00,1.00)
decision_tree_performance_comparison_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Test Data")
decision_tree_performance_comparison_F1_plot.set_xlabel("F1 Score Performance")
decision_tree_performance_comparison_F1_plot.set_ylabel("Decision Tree Model")
decision_tree_performance_comparison_F1_plot.grid(False)
decision_tree_performance_comparison_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in decision_tree_performance_comparison_F1_plot.containers:
    decision_tree_performance_comparison_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Plotting the confusion matrices
# for all the Decision Tree models
##################################
classifiers = {"optimal_decision_tree": optimal_decision_tree,
"weighted_decision_tree": weighted_decision_tree,
"upsampled_decision_tree": upsampled_decision_tree,
"downsampled_decision_tree": downsampled_decision_tree}
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix)
    disp.plot(ax=axes[i], xticks_rotation=0)
    disp.ax_.grid(False)
    disp.ax_.set_title(key)
    disp.im_.colorbar.remove()
fig.colorbar(disp.im_, ax=axes)
plt.show()
##################################
# Consolidating all the
# Random Forest
# model performance measures
##################################
random_forest_performance_comparison = pd.concat([optimal_random_forest_performance_train,
optimal_random_forest_performance_test,
weighted_random_forest_performance_train,
weighted_random_forest_performance_test,
upsampled_random_forest_performance_train,
upsampled_random_forest_performance_test,
downsampled_random_forest_performance_train,
downsampled_random_forest_performance_test],
ignore_index=True)
print('Consolidated Random Forest Model Performance on Train and Test Data: ')
display(random_forest_performance_comparison)
Consolidated Random Forest Model Performance on Train and Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.956140 | optimal_random_forest | train |
1 | Precision | 0.928571 | optimal_random_forest | train |
2 | Recall | 0.896552 | optimal_random_forest | train |
3 | F1 | 0.912281 | optimal_random_forest | train |
4 | AUROC | 0.936511 | optimal_random_forest | train |
5 | Accuracy | 0.877551 | optimal_random_forest | test |
6 | Precision | 0.875000 | optimal_random_forest | test |
7 | Recall | 0.583333 | optimal_random_forest | test |
8 | F1 | 0.700000 | optimal_random_forest | test |
9 | AUROC | 0.778153 | optimal_random_forest | test |
10 | Accuracy | 0.973684 | weighted_random_forest | train |
11 | Precision | 0.906250 | weighted_random_forest | train |
12 | Recall | 1.000000 | weighted_random_forest | train |
13 | F1 | 0.950820 | weighted_random_forest | train |
14 | AUROC | 0.982353 | weighted_random_forest | train |
15 | Accuracy | 0.897959 | weighted_random_forest | test |
16 | Precision | 0.888889 | weighted_random_forest | test |
17 | Recall | 0.666667 | weighted_random_forest | test |
18 | F1 | 0.761905 | weighted_random_forest | test |
19 | AUROC | 0.819820 | weighted_random_forest | test |
20 | Accuracy | 0.973684 | upsampled_random_forest | train |
21 | Precision | 0.906250 | upsampled_random_forest | train |
22 | Recall | 1.000000 | upsampled_random_forest | train |
23 | F1 | 0.950820 | upsampled_random_forest | train |
24 | AUROC | 0.982353 | upsampled_random_forest | train |
25 | Accuracy | 0.897959 | upsampled_random_forest | test |
26 | Precision | 0.888889 | upsampled_random_forest | test |
27 | Recall | 0.666667 | upsampled_random_forest | test |
28 | F1 | 0.761905 | upsampled_random_forest | test |
29 | AUROC | 0.819820 | upsampled_random_forest | test |
30 | Accuracy | 0.964912 | downsampled_random_forest | train |
31 | Precision | 0.903226 | downsampled_random_forest | train |
32 | Recall | 0.965517 | downsampled_random_forest | train |
33 | F1 | 0.933333 | downsampled_random_forest | train |
34 | AUROC | 0.965112 | downsampled_random_forest | train |
35 | Accuracy | 0.897959 | downsampled_random_forest | test |
36 | Precision | 0.888889 | downsampled_random_forest | test |
37 | Recall | 0.666667 | downsampled_random_forest | test |
38 | F1 | 0.761905 | downsampled_random_forest | test |
39 | AUROC | 0.819820 | downsampled_random_forest | test |
##################################
# Consolidating all the F1 score
# model performance measures
##################################
random_forest_performance_comparison_F1 = random_forest_performance_comparison[random_forest_performance_comparison['metric_name']=='F1']
random_forest_performance_comparison_F1_train = random_forest_performance_comparison_F1[random_forest_performance_comparison_F1['set']=='train'].loc[:,"metric_value"]
random_forest_performance_comparison_F1_test = random_forest_performance_comparison_F1[random_forest_performance_comparison_F1['set']=='test'].loc[:,"metric_value"]
##################################
# Combining all the F1 score
# model performance measures
# between train and test sets
##################################
random_forest_performance_comparison_F1_plot = pd.DataFrame({'train': random_forest_performance_comparison_F1_train.values,
'test': random_forest_performance_comparison_F1_test.values},
index=random_forest_performance_comparison_F1['model'].unique())
random_forest_performance_comparison_F1_plot
train | test | |
---|---|---|
optimal_random_forest | 0.912281 | 0.700000 |
weighted_random_forest | 0.950820 | 0.761905 |
upsampled_random_forest | 0.950820 | 0.761905 |
downsampled_random_forest | 0.933333 | 0.761905 |
##################################
# Plotting all the F1 score
# model performance measures
# between train and test sets
##################################
random_forest_performance_comparison_F1_plot = random_forest_performance_comparison_F1_plot.plot.barh(figsize=(10, 6))
random_forest_performance_comparison_F1_plot.set_xlim(0.00,1.00)
random_forest_performance_comparison_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Test Data")
random_forest_performance_comparison_F1_plot.set_xlabel("F1 Score Performance")
random_forest_performance_comparison_F1_plot.set_ylabel("Random Forest Model")
random_forest_performance_comparison_F1_plot.grid(False)
random_forest_performance_comparison_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in random_forest_performance_comparison_F1_plot.containers:
    random_forest_performance_comparison_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Plotting the confusion matrices
# for all the Random Forest models
##################################
classifiers = {"optimal_random_forest": optimal_random_forest,
"weighted_random_forest": weighted_random_forest,
"upsampled_random_forest": upsampled_random_forest,
"downsampled_random_forest": downsampled_random_forest}
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix)
    disp.plot(ax=axes[i], xticks_rotation=0)
    disp.ax_.grid(False)
    disp.ax_.set_title(key)
    disp.im_.colorbar.remove()
fig.colorbar(disp.im_, ax=axes)
plt.show()
##################################
# Consolidating all the
# Support Vector Machine
# model performance measures
##################################
support_vector_machine_performance_comparison = pd.concat([optimal_support_vector_machine_performance_train,
optimal_support_vector_machine_performance_test,
weighted_support_vector_machine_performance_train,
weighted_support_vector_machine_performance_test,
upsampled_support_vector_machine_performance_train,
upsampled_support_vector_machine_performance_test,
downsampled_support_vector_machine_performance_train,
downsampled_support_vector_machine_performance_test],
ignore_index=True)
print('Consolidated Support Vector Machine Model Performance on Train and Test Data: ')
display(support_vector_machine_performance_comparison)
Consolidated Support Vector Machine Model Performance on Train and Test Data:
metric_name | metric_value | model | set | |
---|---|---|---|---|
0 | Accuracy | 0.947368 | optimal_support_vector_machine | train |
1 | Precision | 0.960000 | optimal_support_vector_machine | train |
2 | Recall | 0.827586 | optimal_support_vector_machine | train |
3 | F1 | 0.888889 | optimal_support_vector_machine | train |
4 | AUROC | 0.907911 | optimal_support_vector_machine | train |
5 | Accuracy | 0.857143 | optimal_support_vector_machine | test |
6 | Precision | 0.857143 | optimal_support_vector_machine | test |
7 | Recall | 0.500000 | optimal_support_vector_machine | test |
8 | F1 | 0.631579 | optimal_support_vector_machine | test |
9 | AUROC | 0.736486 | optimal_support_vector_machine | test |
10 | Accuracy | 0.964912 | weighted_support_vector_machine | train |
11 | Precision | 0.962963 | weighted_support_vector_machine | train |
12 | Recall | 0.896552 | weighted_support_vector_machine | train |
13 | F1 | 0.928571 | weighted_support_vector_machine | train |
14 | AUROC | 0.942394 | weighted_support_vector_machine | train |
15 | Accuracy | 0.877551 | weighted_support_vector_machine | test |
16 | Precision | 0.875000 | weighted_support_vector_machine | test |
17 | Recall | 0.583333 | weighted_support_vector_machine | test |
18 | F1 | 0.700000 | weighted_support_vector_machine | test |
19 | AUROC | 0.778153 | weighted_support_vector_machine | test |
20 | Accuracy | 0.973684 | upsampled_support_vector_machine | train |
21 | Precision | 0.933333 | upsampled_support_vector_machine | train |
22 | Recall | 0.965517 | upsampled_support_vector_machine | train |
23 | F1 | 0.949153 | upsampled_support_vector_machine | train |
24 | AUROC | 0.970994 | upsampled_support_vector_machine | train |
25 | Accuracy | 0.897959 | upsampled_support_vector_machine | test |
26 | Precision | 0.818182 | upsampled_support_vector_machine | test |
27 | Recall | 0.750000 | upsampled_support_vector_machine | test |
28 | F1 | 0.782609 | upsampled_support_vector_machine | test |
29 | AUROC | 0.847973 | upsampled_support_vector_machine | test |
30 | Accuracy | 0.956140 | downsampled_support_vector_machine | train |
31 | Precision | 0.928571 | downsampled_support_vector_machine | train |
32 | Recall | 0.896552 | downsampled_support_vector_machine | train |
33 | F1 | 0.912281 | downsampled_support_vector_machine | train |
34 | AUROC | 0.936511 | downsampled_support_vector_machine | train |
35 | Accuracy | 0.897959 | downsampled_support_vector_machine | test |
36 | Precision | 0.888889 | downsampled_support_vector_machine | test |
37 | Recall | 0.666667 | downsampled_support_vector_machine | test |
38 | F1 | 0.761905 | downsampled_support_vector_machine | test |
39 | AUROC | 0.819820 | downsampled_support_vector_machine | test |
##################################
# Consolidating all the F1 score
# model performance measures
##################################
support_vector_machine_performance_comparison_F1 = support_vector_machine_performance_comparison[support_vector_machine_performance_comparison['metric_name']=='F1']
support_vector_machine_performance_comparison_F1_train = support_vector_machine_performance_comparison_F1[support_vector_machine_performance_comparison_F1['set']=='train'].loc[:,"metric_value"]
support_vector_machine_performance_comparison_F1_test = support_vector_machine_performance_comparison_F1[support_vector_machine_performance_comparison_F1['set']=='test'].loc[:,"metric_value"]
##################################
# Combining all the F1 score
# model performance measures
# between train and test sets
##################################
support_vector_machine_performance_comparison_F1_plot = pd.DataFrame({'train': support_vector_machine_performance_comparison_F1_train.values,
'test': support_vector_machine_performance_comparison_F1_test.values},
index=support_vector_machine_performance_comparison_F1['model'].unique())
support_vector_machine_performance_comparison_F1_plot
train | test | |
---|---|---|
optimal_support_vector_machine | 0.888889 | 0.631579 |
weighted_support_vector_machine | 0.928571 | 0.700000 |
upsampled_support_vector_machine | 0.949153 | 0.782609 |
downsampled_support_vector_machine | 0.912281 | 0.761905 |
##################################
# Plotting all the F1 score
# model performance measures
# between train and test sets
##################################
support_vector_machine_performance_comparison_F1_plot = support_vector_machine_performance_comparison_F1_plot.plot.barh(figsize=(10, 6))
support_vector_machine_performance_comparison_F1_plot.set_xlim(0.00,1.00)
support_vector_machine_performance_comparison_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Test Data")
support_vector_machine_performance_comparison_F1_plot.set_xlabel("F1 Score Performance")
support_vector_machine_performance_comparison_F1_plot.set_ylabel("Support Vector Machine Model")
support_vector_machine_performance_comparison_F1_plot.grid(False)
support_vector_machine_performance_comparison_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in support_vector_machine_performance_comparison_F1_plot.containers:
    support_vector_machine_performance_comparison_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Plotting the confusion matrices
# for all the Support Vector Machine models
##################################
classifiers = {"optimal_support_vector_machine": optimal_support_vector_machine,
"weighted_support_vector_machine": weighted_support_vector_machine,
"upsampled_support_vector_machine": upsampled_support_vector_machine,
"downsampled_support_vector_machine": downsampled_support_vector_machine}
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for i, (key, classifier) in enumerate(classifiers.items()):
    y_pred = classifier.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(cf_matrix)
    disp.plot(ax=axes[i], xticks_rotation=0)
    disp.ax_.grid(False)
    disp.ax_.set_title(key)
    disp.im_.colorbar.remove()
fig.colorbar(disp.im_, ax=axes)
plt.show()
1.3.9.2 Logistic Regression ¶
- The logistic regression model from the sklearn.linear_model Python library API was implemented as a meta-learner for the stacking algorithm (a simplified sketch of the stacking mechanism is provided after the model formulation below).
- The meta-learner used fixed hyperparameters with no tuning applied:
- C = inverse of regularization strength held constant at a value of 1
- penalty = penalty norm held constant at a value of L2
- solver = algorithm used in the optimization problem held constant at a value of Lbfgs
- class_weight = weights associated with classes held constant at a value of None
- max_iter = maximum number of iterations taken for the solvers to converge held constant at a value of 500
- The stacked model was trained on the SMOTE-upsampled train data and evaluated against data reflecting the original 3:1 class imbalance between the LOW and HIGH CANRAT categories.
- The apparent model performance of the optimal model is summarized as follows:
- Accuracy = 0.9736
- Precision = 0.9062
- Recall = 1.0000
- F1 Score = 0.9508
- AUROC = 0.9823
- The independent test model performance of the final model is summarized as follows:
- Accuracy = 0.9183
- Precision = 0.9000
- Recall = 0.7500
- F1 Score = 0.8181
- AUROC = 0.8614
- A high difference between the apparent and independent test model performance was observed, indicative of excessive model overfitting.
##################################
# Formulating the base learners
# using the optimal hyperparameters
# for the upsampled models
##################################
base_learners = [('LR', LogisticRegression(C=1.0,
class_weight=None,
max_iter=500,
penalty='l1',
random_state=88888888,
solver='saga')),
('DT', DecisionTreeClassifier(class_weight=None,
criterion='entropy',
max_depth=3,
min_samples_leaf=5,
random_state=88888888)),
('RF', RandomForestClassifier(class_weight=None,
criterion='entropy',
max_depth=7,
max_features='sqrt',
min_samples_leaf=3,
n_estimators=100,
random_state=88888888)),
('SVM', SVC(class_weight=None,
C=1.0,
kernel='linear',
random_state=88888888))]
##################################
# Formulating the meta learner
# using default hyperparameters
##################################
meta_learner = LogisticRegression(C=1.0,
class_weight=None,
max_iter=500,
random_state=88888888)
##################################
# Formulating the stacked model
# using the base and meta learners
##################################
stacked_logistic_regression = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
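For reference, the following is a simplified sketch of what the stacking algorithm formulated above does internally: the out-of-fold predictions of each base learner on the resampled train data are assembled into meta-features on which the meta-learner is trained. This is an illustration only; the actual StackingClassifier defaults to 5-fold cross-validation and uses probability or decision-function outputs from the base learners where available.
##################################
# Simplified illustration of the
# stacking mechanism (not part of
# the modelling pipeline)
##################################
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict
import numpy as np
meta_features = np.column_stack([cross_val_predict(clone(estimator), X_train_smote, y_train_smote, cv=5)
                                 for _, estimator in base_learners])
illustrative_meta_learner = clone(meta_learner).fit(meta_features, y_train_smote)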
##################################
# Fitting the stacked Logistic Regression model
##################################
stacked_logistic_regression.fit(X_train_smote, y_train_smote)
StackingClassifier(estimators=[('LR', LogisticRegression(max_iter=500, penalty='l1', random_state=88888888, solver='saga')), ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5, random_state=88888888)), ('RF', RandomForestClassifier(criterion='entropy', max_depth=7, min_samples_leaf=3, random_state=88888888)), ('SVM', SVC(kernel='linear', random_state=88888888))], final_estimator=LogisticRegression(max_iter=500, random_state=88888888))
##################################
# Evaluating the stacked Logistic Regression model
# on the train set
##################################
stacked_logistic_regression_y_hat_train = stacked_logistic_regression.predict(X_train)
##################################
# Gathering the model evaluation metrics
##################################
stacked_logistic_regression_performance_train = model_performance_evaluation(y_train, stacked_logistic_regression_y_hat_train)
stacked_logistic_regression_performance_train['model'] = ['stacked_logistic_regression'] * 5
stacked_logistic_regression_performance_train['set'] = ['train'] * 5
print('Stacked Logistic Regression Model Performance on Train Data: ')
display(stacked_logistic_regression_performance_train)
Stacked Logistic Regression Model Performance on Train Data:
 | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.973684 | stacked_logistic_regression | train |
1 | Precision | 0.933333 | stacked_logistic_regression | train |
2 | Recall | 0.965517 | stacked_logistic_regression | train |
3 | F1 | 0.949153 | stacked_logistic_regression | train |
4 | AUROC | 0.970994 | stacked_logistic_regression | train |
##################################
# Evaluating the stacked Logistic Regression model
# on the test set
##################################
stacked_logistic_regression_y_hat_test = stacked_logistic_regression.predict(X_test)
##################################
# Gathering the model evaluation metrics
##################################
stacked_logistic_regression_performance_test = model_performance_evaluation(y_test, stacked_logistic_regression_y_hat_test)
stacked_logistic_regression_performance_test['model'] = ['stacked_logistic_regression'] * 5
stacked_logistic_regression_performance_test['set'] = ['test'] * 5
print('Stacked Logistic Regression Model Performance on Test Data: ')
display(stacked_logistic_regression_performance_test)
Stacked Logistic Regression Model Performance on Test Data:
 | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.918367 | stacked_logistic_regression | test |
1 | Precision | 0.900000 | stacked_logistic_regression | test |
2 | Recall | 0.750000 | stacked_logistic_regression | test |
3 | F1 | 0.818182 | stacked_logistic_regression | test |
4 | AUROC | 0.861486 | stacked_logistic_regression | test |
1.3.10 Model Selection ¶
- Among the formulated versions of the logistic regression model, the model which applied upsampling of the minority class using SMOTE was used as a base learner for the model stacking algorithm.
- Accuracy = 0.9183
- Precision = 0.9000
- Recall = 0.7500
- F1 Score = 0.8181
- AUROC = 0.8614
- Among the formulated versions of the decision tree model, the model which applied upsampling of the minority class using SMOTE was used as a base learner for the model stacking algorithm.
- Accuracy = 0.8979
- Precision = 0.7692
- Recall = 0.8333
- F1 Score = 0.8000
- AUROC = 0.8761
- Among the formulated versions of the random forest model, the model which applied upsampling of the minority class using SMOTE was used as a base learner for the model stacking algorithm.
- Accuracy = 0.9387
- Precision = 0.8461
- Recall = 0.9167
- F1 Score = 0.8800
- AUROC = 0.9313
- Among the formulated versions of the support vector machine model, the model which applied upsampling of the minority class using SMOTE was used as a base learner for the model stacking algorithm.
- Accuracy = 0.8979
- Precision = 0.8181
- Recall = 0.7500
- F1 Score = 0.7826
- AUROC = 0.8479
- The stacked logistic regression model composed of the individual base learners demonstrated sufficient class discrimination:
- Accuracy = 0.9183
- Precision = 0.9000
- Recall = 0.7500
- F1 Score = 0.8181
- AUROC = 0.8614
- Comparing all results from the formulated base and stacked models, the logistic regression model which applied class weights still demonstrated the best independent test model performance and was selected as the final model for classification.
- Accuracy = 0.9387
- Precision = 0.8461
- Recall = 0.9167
- F1 Score = 0.8800
- AUROC = 0.9313
##################################
# Consolidating all the
# base and meta-learner
# model performance measures
##################################
base_meta_learner_performance_comparison = pd.concat([weighted_logistic_regression_performance_train,
weighted_logistic_regression_performance_test,
upsampled_logistic_regression_performance_train,
upsampled_logistic_regression_performance_test,
upsampled_decision_tree_performance_train,
upsampled_decision_tree_performance_test,
upsampled_random_forest_performance_train,
upsampled_random_forest_performance_test,
upsampled_support_vector_machine_performance_train,
upsampled_support_vector_machine_performance_test,
stacked_logistic_regression_performance_train,
stacked_logistic_regression_performance_test],
ignore_index=True)
print('Consolidated Base and Meta Learner Model Performance on Train and Test Data: ')
display(base_meta_learner_performance_comparison)
Consolidated Base and Meta Learner Model Performance on Train and Test Data:
 | metric_name | metric_value | model | set |
---|---|---|---|---|
0 | Accuracy | 0.894737 | weighted_logistic_regression | train |
1 | Precision | 0.707317 | weighted_logistic_regression | train |
2 | Recall | 1.000000 | weighted_logistic_regression | train |
3 | F1 | 0.828571 | weighted_logistic_regression | train |
4 | AUROC | 0.929412 | weighted_logistic_regression | train |
5 | Accuracy | 0.938776 | weighted_logistic_regression | test |
6 | Precision | 0.846154 | weighted_logistic_regression | test |
7 | Recall | 0.916667 | weighted_logistic_regression | test |
8 | F1 | 0.880000 | weighted_logistic_regression | test |
9 | AUROC | 0.931306 | weighted_logistic_regression | test |
10 | Accuracy | 0.964912 | upsampled_logistic_regression | train |
11 | Precision | 0.903226 | upsampled_logistic_regression | train |
12 | Recall | 0.965517 | upsampled_logistic_regression | train |
13 | F1 | 0.933333 | upsampled_logistic_regression | train |
14 | AUROC | 0.965112 | upsampled_logistic_regression | train |
15 | Accuracy | 0.918367 | upsampled_logistic_regression | test |
16 | Precision | 0.900000 | upsampled_logistic_regression | test |
17 | Recall | 0.750000 | upsampled_logistic_regression | test |
18 | F1 | 0.818182 | upsampled_logistic_regression | test |
19 | AUROC | 0.861486 | upsampled_logistic_regression | test |
20 | Accuracy | 0.921053 | upsampled_decision_tree | train |
21 | Precision | 0.763158 | upsampled_decision_tree | train |
22 | Recall | 1.000000 | upsampled_decision_tree | train |
23 | F1 | 0.865672 | upsampled_decision_tree | train |
24 | AUROC | 0.947059 | upsampled_decision_tree | train |
25 | Accuracy | 0.897959 | upsampled_decision_tree | test |
26 | Precision | 0.769231 | upsampled_decision_tree | test |
27 | Recall | 0.833333 | upsampled_decision_tree | test |
28 | F1 | 0.800000 | upsampled_decision_tree | test |
29 | AUROC | 0.876126 | upsampled_decision_tree | test |
30 | Accuracy | 0.973684 | upsampled_random_forest | train |
31 | Precision | 0.906250 | upsampled_random_forest | train |
32 | Recall | 1.000000 | upsampled_random_forest | train |
33 | F1 | 0.950820 | upsampled_random_forest | train |
34 | AUROC | 0.982353 | upsampled_random_forest | train |
35 | Accuracy | 0.897959 | upsampled_random_forest | test |
36 | Precision | 0.888889 | upsampled_random_forest | test |
37 | Recall | 0.666667 | upsampled_random_forest | test |
38 | F1 | 0.761905 | upsampled_random_forest | test |
39 | AUROC | 0.819820 | upsampled_random_forest | test |
40 | Accuracy | 0.973684 | upsampled_support_vector_machine | train |
41 | Precision | 0.933333 | upsampled_support_vector_machine | train |
42 | Recall | 0.965517 | upsampled_support_vector_machine | train |
43 | F1 | 0.949153 | upsampled_support_vector_machine | train |
44 | AUROC | 0.970994 | upsampled_support_vector_machine | train |
45 | Accuracy | 0.897959 | upsampled_support_vector_machine | test |
46 | Precision | 0.818182 | upsampled_support_vector_machine | test |
47 | Recall | 0.750000 | upsampled_support_vector_machine | test |
48 | F1 | 0.782609 | upsampled_support_vector_machine | test |
49 | AUROC | 0.847973 | upsampled_support_vector_machine | test |
50 | Accuracy | 0.973684 | stacked_logistic_regression | train |
51 | Precision | 0.933333 | stacked_logistic_regression | train |
52 | Recall | 0.965517 | stacked_logistic_regression | train |
53 | F1 | 0.949153 | stacked_logistic_regression | train |
54 | AUROC | 0.970994 | stacked_logistic_regression | train |
55 | Accuracy | 0.918367 | stacked_logistic_regression | test |
56 | Precision | 0.900000 | stacked_logistic_regression | test |
57 | Recall | 0.750000 | stacked_logistic_regression | test |
58 | F1 | 0.818182 | stacked_logistic_regression | test |
59 | AUROC | 0.861486 | stacked_logistic_regression | test |
##################################
# Consolidating all the F1 score
# model performance measures
##################################
base_meta_learner_performance_comparison_F1 = base_meta_learner_performance_comparison[base_meta_learner_performance_comparison['metric_name']=='F1']
base_meta_learner_performance_comparison_F1_train = base_meta_learner_performance_comparison_F1[base_meta_learner_performance_comparison_F1['set']=='train'].loc[:,"metric_value"]
base_meta_learner_performance_comparison_F1_test = base_meta_learner_performance_comparison_F1[base_meta_learner_performance_comparison_F1['set']=='test'].loc[:,"metric_value"]
##################################
# Combining all the F1 score
# model performance measures
# between train and test sets
##################################
base_meta_learner_performance_comparison_F1_plot = pd.DataFrame({'train': base_meta_learner_performance_comparison_F1_train.values,
'test': base_meta_learner_performance_comparison_F1_test.values},
index=base_meta_learner_performance_comparison_F1['model'].unique())
base_meta_learner_performance_comparison_F1_plot
model | train | test |
---|---|---|
weighted_logistic_regression | 0.828571 | 0.880000 |
upsampled_logistic_regression | 0.933333 | 0.818182 |
upsampled_decision_tree | 0.865672 | 0.800000 |
upsampled_random_forest | 0.950820 | 0.761905 |
upsampled_support_vector_machine | 0.949153 | 0.782609 |
stacked_logistic_regression | 0.949153 | 0.818182 |
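Since model selection also penalizes overfitting, the gap between the apparent (train) and independent test F1 scores can be computed directly from the frame above before it is reassigned to the plot axes below; a minimal sketch using the names defined above:
##################################
# Computing the train-test F1 gap
# per model as a simple indicator
# of overfitting
##################################
f1_overfitting_gap = (base_meta_learner_performance_comparison_F1_plot['train']
                      - base_meta_learner_performance_comparison_F1_plot['test'])
print(f1_overfitting_gap.sort_values(ascending=False))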
##################################
# Plotting all the F1 score
# model performance measures
# between train and test sets
##################################
base_meta_learner_performance_comparison_F1_plot = base_meta_learner_performance_comparison_F1_plot.plot.barh(figsize=(10, 6))
base_meta_learner_performance_comparison_F1_plot.set_xlim(0.00,1.00)
base_meta_learner_performance_comparison_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Test Data")
base_meta_learner_performance_comparison_F1_plot.set_xlabel("F1 Score Performance")
base_meta_learner_performance_comparison_F1_plot.set_ylabel("Base and Meta Learner Model")
base_meta_learner_performance_comparison_F1_plot.grid(False)
base_meta_learner_performance_comparison_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in base_meta_learner_performance_comparison_F1_plot.containers:
base_meta_learner_performance_comparison_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Consolidating all the score
# model performance measures
# for the test set
##################################
base_meta_learner_performance_comparison_Accuracy_test = base_meta_learner_performance_comparison[(base_meta_learner_performance_comparison['set']=='test') & (base_meta_learner_performance_comparison['metric_name']=='Accuracy')].loc[:,"metric_value"]
base_meta_learner_performance_comparison_Precision_test = base_meta_learner_performance_comparison[(base_meta_learner_performance_comparison['set']=='test') & (base_meta_learner_performance_comparison['metric_name']=='Precision')].loc[:,"metric_value"]
base_meta_learner_performance_comparison_Recall_test = base_meta_learner_performance_comparison[(base_meta_learner_performance_comparison['set']=='test') & (base_meta_learner_performance_comparison['metric_name']=='Recall')].loc[:,"metric_value"]
base_meta_learner_performance_comparison_F1_test = base_meta_learner_performance_comparison[(base_meta_learner_performance_comparison['set']=='test') & (base_meta_learner_performance_comparison['metric_name']=='F1')].loc[:,"metric_value"]
base_meta_learner_performance_comparison_AUROC_test = base_meta_learner_performance_comparison[(base_meta_learner_performance_comparison['set']=='test') & (base_meta_learner_performance_comparison['metric_name']=='AUROC')].loc[:,"metric_value"]
##################################
# Combining all the score
# model performance measures
# for the test set
##################################
base_meta_learner_performance_comparison_all_plot = pd.DataFrame({'accuracy': base_meta_learner_performance_comparison_Accuracy_test.values,
'precision': base_meta_learner_performance_comparison_Precision_test.values,
'recall': base_meta_learner_performance_comparison_Recall_test.values,
'f1': base_meta_learner_performance_comparison_F1_test.values,
'auroc': base_meta_learner_performance_comparison_AUROC_test.values},
index=base_meta_learner_performance_comparison['model'].unique())
base_meta_learner_performance_comparison_all_plot
model | accuracy | precision | recall | f1 | auroc |
---|---|---|---|---|---|
weighted_logistic_regression | 0.938776 | 0.846154 | 0.916667 | 0.880000 | 0.931306 |
upsampled_logistic_regression | 0.918367 | 0.900000 | 0.750000 | 0.818182 | 0.861486 |
upsampled_decision_tree | 0.897959 | 0.769231 | 0.833333 | 0.800000 | 0.876126 |
upsampled_random_forest | 0.897959 | 0.888889 | 0.666667 | 0.761905 | 0.819820 |
upsampled_support_vector_machine | 0.897959 | 0.818182 | 0.750000 | 0.782609 | 0.847973 |
stacked_logistic_regression | 0.918367 | 0.900000 | 0.750000 | 0.818182 | 0.861486 |
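As a minimal sketch using the consolidated test metrics above, the final-model choice described in the Model Selection section can also be reproduced programmatically by taking the candidate with the highest externally validated F1 score:
##################################
# Identifying the candidate model
# with the highest externally
# validated F1 score
##################################
best_model_by_test_f1 = base_meta_learner_performance_comparison_all_plot['f1'].idxmax()
print(best_model_by_test_f1)
# expected output based on the table above: weighted_logistic_regression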
##################################
# Plotting all the score
# model performance measures
# for the test set
##################################
base_meta_learner_performance_comparison_all_plot = base_meta_learner_performance_comparison_all_plot.plot.barh(figsize=(10, 9),width=0.90)
base_meta_learner_performance_comparison_all_plot.set_xlim(0.00,1.00)
base_meta_learner_performance_comparison_all_plot.set_title("Model Comparison by Score Performance on Test Data")
base_meta_learner_performance_comparison_all_plot.set_xlabel("Score Performance")
base_meta_learner_performance_comparison_all_plot.set_ylabel("Base and Meta Learner Model")
base_meta_learner_performance_comparison_all_plot.grid(False)
base_meta_learner_performance_comparison_all_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in base_meta_learner_performance_comparison_all_plot.containers:
base_meta_learner_performance_comparison_all_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
##################################
# Plotting the confusion matrices
# for all the candidate base and stacked models
##################################
classifiers = {"upsampled_logistic_regression": upsampled_logistic_regression,
"upsampled_decision_tree": upsampled_decision_tree,
"upsampled_random_forest": upsampled_random_forest,
"upsampled_support_vector_machine": upsampled_support_vector_machine,
"stacked_logistic_regression": stacked_logistic_regression,
"weighted_logistic_regression": weighted_logistic_regression,}
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
axes = axes.ravel()
for i, (key, classifier) in enumerate(classifiers.items()):
y_pred = classifier.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cf_matrix)
disp.plot(ax=axes[i], xticks_rotation=0)
disp.ax_.grid(False)
disp.ax_.set_title(key)
disp.im_.colorbar.remove()
fig.colorbar(disp.im_, ax=axes)
plt.show()
1.3.11 Model Presentation ¶
1.3.11.1 Odds Ratios ¶
- The most important predictors in the model, ranked by their Absolute Coefficient Value and Estimated Odds Ratio for a CANRAT=HIGH Prediction, are listed as follows.
- EPISCO: Model.Coefficient=+1.136, Odds.Ratio=3.114
- GDPCAP: Model.Coefficient=+0.596, Odds.Ratio=1.815
- DTHCMD: Model.Coefficient=-0.534, Odds.Ratio=0.586
- LIFEXP: Model.Coefficient=+0.473, Odds.Ratio=1.604
- TUBINC: Model.Coefficient=-0.412, Odds.Ratio=0.662
- HDICAT_VH: Model.Coefficient=+0.268, Odds.Ratio=1.308
- CO2EMI: Model.Coefficient=-0.151, Odds.Ratio=0.860
- URBPOP: Model.Coefficient=+0.094, Odds.Ratio=1.098
##################################
# Reformulating the weighted Logistic Regression model
# as the final classification model
# with the optimal hyperparameters
##################################
final_model = LogisticRegression(C=1.0,
class_weight={0: 0.25, 1: 0.75},
solver='liblinear',
penalty='l2',
max_iter=500,
random_state=88888888)
final_model.fit(X_train, y_train)
LogisticRegression(class_weight={0: 0.25, 1: 0.75}, max_iter=500, random_state=88888888, solver='liblinear')
##################################
# Gathering the model coefficients
# and the estimated log-odds
# of the weighted Logistic Regression model
##################################
final_model_coefficient = pd.DataFrame(zip(X_train.columns,
final_model.coef_[0].tolist(),
np.exp(final_model.coef_)[0].tolist()),
columns=['model_predictor','model_coefficient','odds_ratio'])
display(final_model_coefficient)
 | model_predictor | model_coefficient | odds_ratio |
---|---|---|---|
0 | URBPOP | 0.093942 | 1.098496 |
1 | LIFEXP | 0.472572 | 1.604115 |
2 | TUBINC | -0.412017 | 0.662313 |
3 | DTHCMD | -0.534044 | 0.586229 |
4 | CO2EMI | -0.150646 | 0.860152 |
5 | GDPCAP | 0.596013 | 1.814868 |
6 | EPISCO | 1.135875 | 3.113897 |
7 | HDICAT_VH | 0.268438 | 1.307920 |
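Each odds ratio above is simply the exponentiated model coefficient, exp(coefficient); assuming the predictors were standardized during preprocessing, it represents the multiplicative change in the odds of a CANRAT=HIGH prediction per one-standard-deviation increase in the predictor. A minimal sketch of this interpretation for the top-ranked predictor, using the names defined above:
##################################
# Interpreting the top-ranked predictor:
# the odds ratio is the exponentiated
# model coefficient
##################################
episco_coefficient = final_model_coefficient.set_index('model_predictor').loc['EPISCO', 'model_coefficient']
episco_odds_ratio = np.exp(episco_coefficient)
print(f'A one-unit (standardized) increase in EPISCO multiplies the odds of CANRAT=HIGH by {episco_odds_ratio:.3f}.')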
1.3.11.2 Shapley Additive Explanations ¶
- The most important predictors in the model, ranked by their Mean Shap Value and Feature Impact on a CANRAT=HIGH Prediction, are listed as follows.
- EPISCO: Mean.Shap.Value=1.00, Feature.Impact=Positive
- GDPCAP: Mean.Shap.Value=0.48, Feature.Impact=Positive
- DTHCMD: Mean.Shap.Value=0.46, Feature.Impact=Negative
- LIFEXP: Mean.Shap.Value=0.38, Feature.Impact=Positive
- TUBINC: Mean.Shap.Value=0.36, Feature.Impact=Negative
- HDICAT_VH: Mean.Shap.Value=0.13, Feature.Impact=Positive
- CO2EMI: Mean.Shap.Value=0.13, Feature.Impact=Negative
- URBPOP: Mean.Shap.Value=0.08, Feature.Impact=Positive
##################################
# Setting up the primary explainer interface
# for the SHAP library using the
# weighted Logistic Regression model
##################################
final_model_explainer = shap.Explainer(final_model, X_train)
##################################
# Gathering up the SHAP values
# for the train set
##################################
final_model_train_shap_values = final_model_explainer(X_train)
##################################
# Gathering up the SHAP values
# for the test set
##################################
final_model_test_shap_values = final_model_explainer(X_test)
##################################
# Formulating the bar plot
# of the SHAP values using the train set
# to estimate global feature importance
##################################
shap.plots.bar(final_model_train_shap_values, show=False)
plt.xlim([0, 1])
plt.show()
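The bar plot above ranks the features by their mean absolute SHAP value; as a minimal sketch using the names defined above, the same global importance ranking can be reproduced directly from the underlying SHAP value array:
##################################
# Reproducing the global feature
# importance ranking as the mean
# absolute SHAP value per feature
##################################
mean_absolute_shap_values = pd.Series(np.abs(final_model_train_shap_values.values).mean(axis=0),
                                      index=X_train.columns).sort_values(ascending=False)
print(mean_absolute_shap_values)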
##################################
# Formulating the bar plot
# of the SHAP values using the test set
# to estimate global feature importance
##################################
shap.plots.bar(final_model_test_shap_values, show=False)
plt.xlim([0, 1])
plt.show()
##################################
# Converting the SHAP values
# to float data types
##################################
final_model_train_shap_values.values = final_model_train_shap_values.values.astype(float)
##################################
# Formulating the beeswarm plot
# of the SHAP values using the train set
# to estimate the feature impact
# on model predictions
##################################
shap.plots.beeswarm(final_model_train_shap_values, show=False)
plt.gcf().set_size_inches(10, 6)
plt.xlim([-3, 3])
plt.show()
##################################
# Converting the SHAP values
# to float data types
##################################
final_model_test_shap_values.values = final_model_test_shap_values.values.astype(float)
##################################
# Formulating the beeswarm plot
# of the SHAP values using the test set
# to estimate the feature impact
# on model predictions
##################################
shap.plots.beeswarm(final_model_test_shap_values, show=False, plot_size=(10, 6))
plt.xlim([-3, 3])
plt.show()
##################################
# Formulating the heatmap plot
# of the SHAP values using the train set
# to estimate the observation impact
# on model predictions
##################################
shap.plots.heatmap(final_model_train_shap_values)
##################################
# Formulating the heatmap plot
# of the SHAP values using the test set
# to estimate the observation impact
# on model predictions
##################################
shap.plots.heatmap(final_model_test_shap_values)
##################################
# Formulating the dependence plots
# of the SHAP values using the train set
# for the most important feature (EPISCO)
# as evaluated against each of the
# remaining features
##################################
final_model_train_shap_value_array = final_model_explainer.shap_values(X_train)
for interaction_feature in ['GDPCAP', 'DTHCMD', 'LIFEXP', 'TUBINC', 'CO2EMI', 'HDICAT_VH', 'URBPOP']:
    shap.dependence_plot('EPISCO', final_model_train_shap_value_array, X_train, interaction_index=interaction_feature)
##################################
# Formulating the dependence plots
# of the SHAP values using the test set
# for the most important feature (EPISCO)
# as evaluated against each of the
# remaining features
##################################
final_model_test_shap_value_array = final_model_explainer.shap_values(X_test)
for interaction_feature in ['GDPCAP', 'DTHCMD', 'LIFEXP', 'TUBINC', 'CO2EMI', 'HDICAT_VH', 'URBPOP']:
    shap.dependence_plot('EPISCO', final_model_test_shap_value_array, X_test, interaction_index=interaction_feature)
2. Summary ¶
A logistic regression model with L2 regularization and class weights provided robust and reliable predictions for countries belonging to the high cancer rate group, which was predominantly characterized by social development (life expectancy, human development index), economic (GDP per capita, urban population), healthcare delivery (death by communicable disease, tuberculosis incidence) and environmental (environmental protection index, CO2 emission) factors. The key drivers identified for high cancer rates, ranked by feature importance with the direction of their conditioned effects indicated, are given as follows:
- Environmental protection index (+)
- GDP per capita (+)
- Death by communicable disease (-)
- Life expectancy (+)
- Tuberculosis incidence (-)
- Human development index (+)
- CO2 emission (-)
- Urban population (+)
Overall, industrialized economies tend to belong to the cluster of countries with higher cancer rates. While progressiveness does not inherently imply higher cancer prevalence, these countries typically have advanced healthcare systems with robust screening and diagnostic capabilities. This can result in more thorough and accurate detection of cancer cases, leading to higher reported incidence rates, while improved reporting mechanisms contribute to a better understanding of the true burden of cancer. Progressive countries also often have higher life expectancies, resulting in older populations; since cancer incidence tends to increase with age, countries with aging populations may experience higher overall cancer rates. Additionally, industrialization and urbanization, often associated with progressiveness, may increase exposure to environmental pollutants and carcinogens, as certain industrial activities, pollution levels and occupational exposures can contribute to higher cancer rates. Given these observations, the relationship between progressiveness and cancer rates is complex, with multiple factors contributing to the observed patterns. It is essential to consider the specific context of each country and conduct detailed analyses to understand the underlying reasons for variations in cancer incidence.
From an initial dataset comprising 177 observations and 21 predictors, an optimal subset of 163 observations and 8 predictors representing social development, economic, healthcare delivery and environmental factors was determined after conducting data quality assessment and feature selection, excluding cases or variables noted with irregularities and applying the preprocessing operations most suitable for the downstream analysis.
Multiple classification models with various hyperparameter combinations were formulated using the Logistic Regression, Decision Tree, Random Forest and Support Vector Machine algorithms. Class imbalance treatments including Class Weights, Upsampling with the Synthetic Minority Oversampling Technique (SMOTE) and Downsampling with Condensed Nearest Neighbors (CNN) were implemented. Ensemble Learning using Model Stacking was additionally explored. The best model with optimized hyperparameters from each algorithm was determined through internal resampling validation using 5-Fold Cross Validation, with the F1 Score used as the primary performance metric among Accuracy, Precision, Recall and Area Under the Receiver Operating Characteristic Curve (AUROC). All candidate models were compared based on internal and external validation performance.
The final model selected among the candidates was a Logistic Regression model with L2 Regularization and Class Weights, defined by the optimal hyperparameters: weights associated with classes (class_weight={0 (LOW): 0.25, 1 (HIGH): 0.75}), inverse of regularization strength (C=1.0), regularization (penalty=L2), algorithm used in the optimization problem (solver=liblinear) and maximum number of iterations taken for the solver to converge (max_iter=500). This model demonstrated the best externally validated F1 Score, AUROC, Recall and Accuracy (F1 Score=0.88, AUROC=0.93, Recall=0.92, Accuracy=0.94, Precision=0.85) with no excessive overfitting when comparing the external and apparent validation metrics.
Post-hoc exploration of the model results involved model-specific (Odds Ratios) and model-agnostic (Shapley Additive Explanations) methods. Both methods were consistent in ranking Environmental protection index, GDP per capita, Death by communicable disease, Life expectancy, Tuberculosis incidence, Human development index, CO2 emission and Urban population as the most important features. These results helped provide insights on the significance, contribution and effect of the various predictors to model prediction.
The current results have limitations which can be further addressed by extending the study to include the following actions:
- Applying adjustments to the classification thresholds by accounting for the class imbalance ratio when maximizing precision and/or recall
- Performing sensitivity analysis by testing the model's performance across multiple classification thresholds (a minimal sketch is provided after this list)
- Incorporating costs associated with false positives and false negatives by considering the relative importance of different types of errors
- Exploring other oversampling (Adasyn, Borderline SMOTE, K-Means SMOTE) and undersampling (NearMiss, Tomek Links, ENN) techniques to address class imbalance
- Experimenting with combining resampling techniques with algorithmic approaches that handle class imbalance internally including bagging and boosting ensembles
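As a minimal sketch of the threshold sensitivity analysis suggested above, and assuming the fitted final_model, X_test and y_test defined earlier, the default 0.50 cutoff can be swept across a grid of candidate thresholds and the externally validated F1 score compared at each:
##################################
# Sweeping the classification threshold
# of the final model and evaluating
# the test F1 score at each cutoff
##################################
from sklearn.metrics import f1_score
final_model_test_probabilities = final_model.predict_proba(X_test)[:, 1]
for threshold in np.arange(0.10, 0.91, 0.10):
    threshold_predictions = (final_model_test_probabilities >= threshold).astype(int)
    print(f'threshold={threshold:.2f}, F1={f1_score(y_test, threshold_predictions):.4f}')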
3. References ¶
- [Book] Data Preparation for Machine Learning: Data Cleaning, Feature Selection, and Data Transforms in Python by Jason Brownlee
- [Book] Feature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson
- [Book] Feature Engineering for Machine Learning by Alice Zheng and Amanda Casari
- [Book] Applied Predictive Modeling by Max Kuhn and Kjell Johnson
- [Book] Data Mining: Practical Machine Learning Tools and Techniques by Ian Witten, Eibe Frank, Mark Hall and Christopher Pal
- [Book] Data Cleaning by Ihab Ilyas and Xu Chu
- [Book] Data Wrangling with Python by Jacqueline Kazil and Katharine Jarmul
- [Book] Regression Modeling Strategies by Frank Harrell
- [Book] Ensemble Methods for Machine Learning by Gautam Kunapuli
- [Book] Imbalanced Classification with Python: Better Metrics, Balance Skewed Classes, Cost-Sensitive Learning by Jason Brownlee
- [Python Library API] NumPy by NumPy Team
- [Python Library API] pandas by Pandas Team
- [Python Library API] seaborn by Seaborn Team
- [Python Library API] matplotlib.pyplot by MatPlotLib Team
- [Python Library API] itertools by Python Team
- [Python Library API] operator by Python Team
- [Python Library API] sklearn.experimental by Scikit-Learn Team
- [Python Library API] sklearn.impute by Scikit-Learn Team
- [Python Library API] sklearn.linear_model by Scikit-Learn Team
- [Python Library API] sklearn.preprocessing by Scikit-Learn Team
- [Python Library API] scipy by SciPy Team
- [Python Library API] sklearn.tree by Scikit-Learn Team
- [Python Library API] sklearn.ensemble by Scikit-Learn Team
- [Python Library API] sklearn.svm by Scikit-Learn Team
- [Python Library API] sklearn.metrics by Scikit-Learn Team
- [Python Library API] sklearn.model_selection by Scikit-Learn Team
- [Python Library API] imblearn.over_sampling by Imbalanced-Learn Team
- [Python Library API] imblearn.under_sampling by Imbalanced-Learn Team
- [Article] Step-by-Step Exploratory Data Analysis (EDA) using Python by Malamahadevan Mahadevan (Analytics Vidhya)
- [Article] Exploratory Data Analysis in Python — A Step-by-Step Process by Andrea D'Agostino (Towards Data Science)
- [Article] Exploratory Data Analysis with Python by Douglas Rocha (Medium)
- [Article] 4 Ways to Automate Exploratory Data Analysis (EDA) in Python by Abdishakur Hassan (BuiltIn)
- [Article] 10 Things To Do When Conducting Your Exploratory Data Analysis (EDA) by Alifia Harmadi (Medium)
- [Article] How to Handle Missing Data with Python by Jason Brownlee (Machine Learning Mastery)
- [Article] Statistical Imputation for Missing Values in Machine Learning by Jason Brownlee (Machine Learning Mastery)
- [Article] Imputing Missing Data with Simple and Advanced Techniques by Idil Ismiguzel (Towards Data Science)
- [Article] Missing Data Imputation Approaches | How to handle missing values in Python by Selva Prabhakaran (Machine Learning +)
- [Article] Master The Skills Of Missing Data Imputation Techniques In Python(2022) And Be Successful by Mrinal Walia (Analytics Vidhya)
- [Article] How to Preprocess Data in Python by Afroz Chakure (BuiltIn)
- [Article] Easy Guide To Data Preprocessing In Python by Ahmad Anis (KDNuggets)
- [Article] Data Preprocessing in Python by Tarun Gupta (Towards Data Science)
- [Article] Data Preprocessing using Python by Suneet Jain (Medium)
- [Article] Data Preprocessing in Python by Abonia Sojasingarayar (Medium)
- [Article] Data Preprocessing in Python by Afroz Chakure (Medium)
- [Article] Detecting and Treating Outliers | Treating the Odd One Out! by Harika Bonthu (Analytics Vidhya)
- [Article] Outlier Treatment with Python by Sangita Yemulwar (Analytics Vidhya)
- [Article] A Guide to Outlier Detection in Python by Sadrach Pierre (BuiltIn)
- [Article] How To Find Outliers in Data Using Python (and How To Handle Them) by Eric Kleppen (Career Foundry)
- [Article] Statistics in Python — Collinearity and Multicollinearity by Wei-Meng Lee (Towards Data Science)
- [Article] Understanding Multicollinearity and How to Detect it in Python by Terence Shin (Towards Data Science)
- [Article] A Python Library to Remove Collinearity by Gianluca Malato (Your Data Teacher)
- [Article] 8 Best Data Transformation in Pandas by Tirendaz AI (Medium)
- [Article] Data Transformation Techniques with Python: Elevate Your Data Game! by Siddharth Verma (Medium)
- [Article] Data Scaling with Python by Benjamin Obi Tayo (KDNuggets)
- [Article] How to Use StandardScaler and MinMaxScaler Transforms in Python by Jason Brownlee (Machine Learning Mastery)
- [Article] Feature Engineering: Scaling, Normalization, and Standardization by Aniruddha Bhandari (Analytics Vidhya)
- [Article] How to Normalize Data Using scikit-learn in Python by Jayant Verma (Digital Ocean)
- [Article] What are Categorical Data Encoding Methods | Binary Encoding by Shipra Saxena (Analytics Vidhya)
- [Article] Guide to Encoding Categorical Values in Python by Chris Moffitt (Practical Business Python)
- [Article] Categorical Data Encoding Techniques in Python: A Complete Guide by Soumen Atta (Medium)
- [Article] Categorical Feature Encoding Techniques by Tara Boyle (Medium)
- [Article] Ordinal and One-Hot Encodings for Categorical Data by Jason Brownlee (Machine Learning Mastery)
- [Article] Hypothesis Testing with Python: Step by Step Hands-On Tutorial with Practical Examples by Ece Işık Polat (Towards Data Science)
- [Article] 17 Statistical Hypothesis Tests in Python (Cheat Sheet) by Jason Brownlee (Machine Learning Mastery)
- [Article] A Step-by-Step Guide to Hypothesis Testing in Python using Scipy by Gabriel Rennó (Medium)
- [Article] How to Evaluate Classification Models in Python: A Beginner's Guide by Sadrach Pierre (BuiltIn)
- [Article] Machine Learning Classifiers Comparison with Python by Roberto Salazar (Towards Data Science)
- [Article] Top 6 Machine Learning Algorithms for Classification by Destin Gong (Towards Data Science)
- [Article] Metrics For Evaluating Machine Learning Classification Models by Cory Maklin (Towards Data Science)
- [Article] Evaluation Metrics for Classification Problems with Implementation in Python by Venu Gopal Kadamba (Medium)
- [Article] Tour of Evaluation Metrics for Imbalanced Classification by Jason Brownlee (Machine Learning Mastery)
- [Article] Metrics To Evaluate Machine Learning Algorithms in Python by Jason Brownlee (Machine Learning Mastery)
- [Article] How To Compare Machine Learning Algorithms in Python with scikit-learn by Jason Brownlee (Machine Learning Mastery)
- [Article] How to Deal With Imbalanced Classification and Regression Data by Prince Canuma (Neptune.AI)
- [Article] Random Oversampling and Undersampling for Imbalanced Classification by Jason Brownlee (Machine Learning Mastery)
- [Article] How to Handle Imbalance Data and Small Training Sets in ML by Ege Hosgungor (Towards Data Science)
- [Article] Class Imbalance Strategies — A Visual Guide with Code by Travis Tang (Towards Data Science)
- [Article] Machine Learning: How to Handle Class Imbalance by Ken Hoffman (Medium)
- [Article] Handling Class Imbalance in Machine Learning by Okan Yenigün (Medium)
- [Article] Undersampling Algorithms for Imbalanced Classification by Jason Brownlee (Machine Learning Mastery)
- [Article] Condensed Nearest Neighbor Rule Undersampling (CNN) & TomekLinks by Rupak Roy (Medium)
- [Article] CNN (Condensed Nearest Neighbors) by Abhishek (Medium)
- [Article] Synthetic Minority Over-sampling TEchnique (SMOTE) by Cory Maklin (Medium)
- [Article] SMOTE for Imbalanced Classification with Python by Swastik Satpathy (Analytics Vidhya)
- [Article] An Introduction to SMOTE by Abid Ali Awan (KDNuggets)
- [Article] A Comprehensive Guide to Ensemble Learning (with Python codes) by Aishwarya Singh (Analytics Vidhya)
- [Article] Stacked Ensembles — Improving Model Performance on a Higher Level by Yenwee Lim (Towards Data Science)
- [Article] Stacking to Improve Model Performance: A Comprehensive Guide on Ensemble Learning in Python by Brijesh Soni (Medium)
- [Article] Stacking Ensemble Machine Learning With Python by Jason Brownlee (Machine Learning Mastery)
- [Publication] Data Quality for Machine Learning Tasks by Nitin Gupta, Shashank Mujumdar, Hima Patel, Satoshi Masuda, Naveen Panwar, Sambaran Bandyopadhyay, Sameep Mehta, Shanmukha Guttula, Shazia Afzal, Ruhi Sharma Mittal and Vitobha Munigala (KDD ’21: Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery & Data Mining)
- [Publication] Overview and Importance of Data Quality for Machine Learning Tasks by Abhinav Jain, Hima Patel, Lokesh Nagalapatti, Nitin Gupta, Sameep Mehta, Shanmukha Guttula, Shashank Mujumdar, Shazia Afzal, Ruhi Sharma Mittal and Vitobha Munigala (KDD ’20: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining)
- [Publication] Multiple Imputation of Discrete and Continuous Data by Fully Conditional Specification by Stef van Buuren (Statistical Methods in Medical Research)
- [Publication] Mathematical Contributions to the Theory of Evolution: Regression, Heredity and Panmixia by Karl Pearson (Royal Society)
- [Publication] A New Family of Power Transformations to Improve Normality or Symmetry by In-Kwon Yeo and Richard Johnson (Biometrika)
- [Publication] The Probable Error of the Mean by Student (Biometrika)
- [Publication] On the Criterion That a Given System of Deviations from the Probable in the Case of a Correlated System of Variables is Such That It can Be Reasonably Supposed to Have Arisen From Random Sampling by Karl Pearson (Philosophical Magazine)
- [Publication] The Origins of Logistic Regression by JS Cramer (Econometrics eJournal)
- [Publication] Classification and Regression Trees by Leo Breiman, Jerome Friedman, Richard Olshen and Charles Stone (Computer Science)
- [Publication] Random Forest by Leo Breiman (Machine Learning)
- [Publication] A Training Algorithm for Optimal Margin Classifiers by Bernhard Boser, Isabelle Guyon and Vladimir Vapnik (Proceedings of the Fifth Annual Workshop on Computational Learning Theory)
- [Publication] SMOTE: Synthetic Minority Over-Sampling Technique by Nitesh Chawla, Kevin Bowyer, Lawrence Hall and Philip Kegelmeyer (Journal of Artificial Intelligence Research)
- [Publication] The Condensed Nearest Neighbor Rule by Peter Hart (IEEE Transactions on Information Theory)
- [Course] IBM Data Analyst Professional Certificate by IBM Team (Coursera)
- [Course] IBM Data Science Professional Certificate by IBM Team (Coursera)
- [Course] IBM Machine Learning Professional Certificate by IBM Team (Coursera)
from IPython.display import display, HTML
display(HTML("<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"))