Supervised Learning: Leveraging Ensemble Learning With Bagging, Boosting, Stacking and Blending Approaches¶


John Pauline Pineda

March 12, 2025


  • 1. Table of Contents
    • 1.1 Data Background
    • 1.2 Data Description
    • 1.3 Data Quality Assessment
    • 1.4 Data Preprocessing
      • 1.4.1 Data Splitting
      • 1.4.2 Data Profiling
      • 1.4.3 Category Aggregation and Encoding
      • 1.4.4 Outlier and Distributional Shape Analysis
      • 1.4.5 Collinearity
    • 1.5 Data Exploration
      • 1.5.1 Exploratory Data Analysis
      • 1.5.2 Hypothesis Testing
    • 1.6 Premodelling Data Preparation
      • 1.6.1 Preprocessed Data Description
      • 1.6.2 Preprocessing Pipeline Development
    • 1.7 Bagged Model Development
      • 1.7.1 Random Forest
      • 1.7.2 Extra Trees
      • 1.7.3 Bagged Decision Trees
      • 1.7.4 Bagged Logistic Regression
      • 1.7.5 Bagged Support Vector Machine
    • 1.8 Boosted Model Development
      • 1.8.1 AdaBoost
      • 1.8.2 Gradient Boosting
      • 1.8.3 XGBoost
      • 1.8.4 Light GBM
      • 1.8.5 CatBoost
    • 1.9 Stacked Model Development
      • 1.9.1 Base Learner - K-Nearest Neighbors
      • 1.9.2 Base Learner - Support Vector Machine
      • 1.9.3 Base Learner - Ridge Classifier
      • 1.9.4 Base Learner - Neural Network
      • 1.9.5 Base Learner - Decision Tree
      • 1.9.6 Meta Learner - Logistic Regression
    • 1.10 Blended Model Development
      • 1.10.1 Base Learner - K-Nearest Neighbors
      • 1.10.2 Base Learner - Support Vector Machine
      • 1.10.3 Base Learner - Ridge Classifier
      • 1.10.4 Base Learner - Neural Network
      • 1.10.5 Base Learner - Decision Tree
      • 1.10.6 Meta Learner - Logistic Regression
    • 1.11 Consolidated Findings
  • 2. Summary
  • 3. References

1. Table of Contents ¶

This project explores different Ensemble Learning approaches which combine the predictions from multiple models to achieve better predictive performance using various helpful packages in Python. The ensemble frameworks applied in the analysis were grouped into three classes: the Bagging Approach, which fits many individual learners on different samples of the same dataset and averages the predictions; the Boosting Approach, which adds ensemble members sequentially to correct the predictions made by prior models and outputs a weighted average of the predictions; and the Stacking or Blending Approach, which consolidates many different and diverse learners on the same data and uses another model to learn how to best combine the predictions. Bagged models applied were the Random Forest, Extra Trees, Bagged Decision Tree, Bagged Logistic Regression and Bagged Support Vector Machine algorithms. Boosting models included the AdaBoost, Stochastic Gradient Boosting, Extreme Gradient Boosting, Light Gradient Boosting Machine and CatBoost algorithms. Individual base learners including the K-Nearest Neighbors, Support Vector Machine, Ridge Classifier, Neural Network and Decision Tree algorithms were stacked or blended together as contributors to the Logistic Regression meta-model. The resulting predictions derived from all ensemble learning models were independently evaluated on a test set based on accuracy and F1 score metrics. All results were consolidated in a Summary presented at the end of the document.

Ensemble Learning is a machine learning technique that improves predictive accuracy by combining multiple models to leverage their collective strengths. Traditional machine learning models often struggle with either high bias, which leads to overly simplistic predictions, or high variance, which makes them too sensitive to fluctuations in the data. Ensemble learning addresses these challenges by aggregating the outputs of several models, creating a more robust and reliable predictor. In classification problems, this can be done through majority voting, weighted averaging, or more advanced meta-learning techniques. The key advantage of ensemble learning is its ability to reduce both bias and variance, leading to better generalization on unseen data. However, this comes at the cost of increased computational complexity and interpretability, as managing multiple models requires more resources and makes it harder to explain predictions.
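
To make the majority-voting mechanism concrete, the minimal sketch below fits a hard-voting ensemble on a synthetic dataset. The data and model choices here are illustrative assumptions only and are separate from the analysis conducted later in this notebook.

##################################
# Illustrative sketch only:
# hard-voting ensemble on synthetic data
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Each base model casts one vote; the majority class label wins
voting = VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=1000)),
                                      ('knn', KNeighborsClassifier()),
                                      ('dt', DecisionTreeClassifier(random_state=42))],
                          voting='hard')
voting.fit(X_train, y_train)
print(f"Hard-voting ensemble accuracy: {voting.score(X_test, y_test):.3f}")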

Bagging (Bootstrap Aggregating) is an ensemble learning technique that reduces model variance by training multiple instances of the same algorithm on different randomly sampled subsets of the training data. The fundamental problem bagging aims to solve is overfitting, particularly in high-variance models. By generating multiple bootstrap samples (random subsets created through sampling with replacement), bagging ensures that each model is trained on slightly different data, making the overall prediction more stable. In classification problems, the final output is obtained by majority voting among the individual models, while in regression, their predictions are averaged. Bagging is particularly effective when dealing with noisy datasets, as it smooths out individual model errors. However, its effectiveness is limited for low-variance models, and the requirement to train multiple models increases computational cost.
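
As a concrete illustration of bootstrap aggregation, the minimal sketch below bags 50 decision trees on synthetic data; the dataset and parameter values are assumptions chosen for demonstration only.

##################################
# Illustrative sketch only:
# bagging decision trees on synthetic data
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

# Each of the 50 trees is fitted on a bootstrap sample
# (drawn with replacement); class predictions are
# combined by majority vote across the ensemble
bagged_trees = BaggingClassifier(DecisionTreeClassifier(random_state=42),
                                 n_estimators=50,
                                 bootstrap=True,
                                 random_state=42)
print(f"Mean CV accuracy: {cross_val_score(bagged_trees, X, y, cv=5).mean():.3f}")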

Boosting is an ensemble learning method that builds a strong classifier by training models sequentially, where each new model focuses on correcting the mistakes of its predecessors. Boosting assigns higher weights to misclassified instances, ensuring that subsequent models pay more attention to these hard-to-classify cases. The motivation behind boosting is to reduce both bias and variance by iteratively refining weak learners — models that perform only slightly better than random guessing — until they collectively form a strong classifier. In classification tasks, predictions are refined by combining weighted outputs of multiple weak models, typically decision stumps or shallow trees. This makes boosting highly effective in uncovering complex patterns in data. However, the sequential nature of boosting makes it computationally expensive compared to bagging, and it is more prone to overfitting if the number of weak learners is too high.
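
The minimal sketch below illustrates this sequential reweighting using AdaBoost with decision stumps on synthetic data; the data and parameter values are assumptions for demonstration only.

##################################
# Illustrative sketch only:
# boosting decision stumps on synthetic data
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

# Decision stumps (max_depth=1) are added sequentially;
# each round upweights the samples misclassified so far,
# and the final prediction is a weighted vote of all stumps
boosted_stumps = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                    n_estimators=100,
                                    learning_rate=0.5,
                                    random_state=42)
print(f"Mean CV accuracy: {cross_val_score(boosted_stumps, X, y, cv=5).mean():.3f}")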

Stacking, or stacked generalization, is an advanced ensemble method that improves predictive performance by training a meta-model to learn the optimal way to combine multiple base models using their out-of-fold predictions. Unlike traditional ensemble techniques such as bagging and boosting, which aggregate predictions through simple rules like averaging or majority voting, stacking introduces a second-level model that intelligently learns how to integrate diverse base models. The process starts by training multiple classifiers on the training dataset. However, instead of directly using their predictions, stacking employs k-fold cross-validation to generate out-of-fold predictions. Specifically, each base model is trained on a subset of the training data while leaving out a validation fold, and predictions on that unseen fold are recorded. This process is repeated across all folds, ensuring that each instance in the training data receives predictions from models that never saw it during training. These out-of-fold predictions are then used as input features for a meta-model, which learns the best way to combine them into a final decision. The advantage of stacking is that it allows different models to complement each other, capturing diverse aspects of the data that a single model might miss. This often results in superior classification accuracy compared to individual models or simpler ensemble approaches. However, stacking is computationally expensive, requiring multiple training iterations for base models and the additional meta-model. It also demands careful tuning to prevent overfitting, as the meta-model’s complexity can introduce new sources of error. Despite these challenges, stacking remains a powerful technique in applications where maximizing predictive performance is a priority.
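
The minimal sketch below demonstrates the out-of-fold mechanism using scikit-learn's StackingClassifier on synthetic data; the specific base learners and parameter values are assumptions for demonstration only.

##################################
# Illustrative sketch only:
# stacking with out-of-fold predictions
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

# cv=5 produces out-of-fold predictions from each base learner;
# the logistic regression meta-model is then trained on those
# predictions instead of the raw features
stacked = StackingClassifier(estimators=[('knn', KNeighborsClassifier()),
                                         ('dt', DecisionTreeClassifier(random_state=42))],
                             final_estimator=LogisticRegression(max_iter=1000),
                             cv=5)
print(f"Mean CV accuracy: {cross_val_score(stacked, X, y, cv=5).mean():.3f}")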

Blending is an ensemble technique that enhances classification accuracy by training a meta-model on a holdout validation set, rather than using out-of-fold predictions like stacking. This simplifies implementation while maintaining the benefits of combining multiple base models. The process of blending starts by training base models on the full training dataset. Instead of applying cross-validation to obtain out-of-fold predictions, blending reserves a small portion of the training data as a holdout set. The base models make predictions on this unseen holdout set, and these predictions are then used as input features for a meta-model, which learns how to optimally combine them into a final classification decision. Since the meta-model is trained on predictions from unseen data, it avoids the risk of overfitting that can sometimes occur when base models are evaluated on the same data they were trained on. Blending is motivated by its simplicity and ease of implementation compared to stacking, as it eliminates the need for repeated k-fold cross-validation to generate training data for the meta-model. However, one drawback is that the meta-model has access to fewer training examples, as a portion of the data is withheld for validation rather than being used for training. This can limit the generalization ability of the final model, especially if the holdout set is too small. Despite this limitation, blending remains a useful approach in applications where a quick and effective ensemble method is needed without the computational overhead of stacking.
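
Since scikit-learn has no dedicated blending estimator, the minimal sketch below implements the holdout-based procedure by hand on synthetic data; all names and parameter values are assumptions for demonstration only.

##################################
# Illustrative sketch only:
# blending with a holdout validation set
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

# Reserve 25% of the data as the holdout set for the meta-model
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)

# Base learners are trained on the training portion only
base_models = [KNeighborsClassifier(), DecisionTreeClassifier(random_state=42)]
for model in base_models:
    model.fit(X_train, y_train)

# Base-model probabilities on the unseen holdout set
# become the input features of the meta-model
meta_features = np.column_stack([m.predict_proba(X_holdout)[:, 1] for m in base_models])
meta_model = LogisticRegression().fit(meta_features, y_holdout)

# At inference, new samples pass through the base models first,
# then through the meta-model:
# new_meta = np.column_stack([m.predict_proba(X_new)[:, 1] for m in base_models])
# predictions = meta_model.predict(new_meta)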

1.1. Data Background ¶

An open Thyroid Disease Dataset from Kaggle (with all credits attributed to Jai Naru and Abuchi Onwuegbusi) was used for the analysis, consolidated from the following primary sources:

  1. Reference Repository entitled Differentiated Thyroid Cancer Recurrence from UC Irvine Machine Learning Repository
  2. Research Paper entitled Machine Learning for Risk Stratification of Thyroid Cancer Patients: a 15-year Cohort Study from the European Archives of Oto-Rhino-Laryngology

This study hypothesized that various clinicopathological characteristics influence differentiated thyroid cancer recurrence among patients.

The dichotomous categorical variable for the study is:

  • Recurred - Status of the patient (Yes, Recurrence of differentiated thyroid cancer | No, No recurrence of differentiated thyroid cancer)

The predictor variables for the study are:

  • Age - Patient's age (Years)
  • Gender - Patient's sex (M | F)
  • Smoking - Indication of smoking (Yes | No)
  • Hx Smoking - Indication of smoking history (Yes | No)
  • Hx Radiotherapy - Indication of radiotherapy history for any condition (Yes | No)
  • Thyroid Function - Status of thyroid function (Euthyroid | Clinical Hyperthyroidism | Clinical Hypothyroidism | Subclinical Hyperthyroidism | Subclinical Hypothyroidism)
  • Physical Examination - Findings from physical examination including palpation of the thyroid gland and surrounding structures (Normal | Diffuse Goiter | Multinodular Goiter | Single Nodular Goiter-Left | Single Nodular Goiter-Right)
  • Adenopathy - Indication of enlarged lymph nodes in the neck region (No | Right | Extensive | Left | Bilateral | Posterior)
  • Pathology - Specific thyroid cancer type as determined by pathology examination of biopsy samples (Follicular | Hurthle Cell | Micropapillary | Papillary)
  • Focality - Indication if the cancer is limited to one location or present in multiple locations (Uni-Focal | Multi-Focal)
  • Risk - Risk category of the cancer based on various factors, such as tumor size, extent of spread, and histological type (Low | Intermediate | High)
  • T - Tumor classification based on its size and extent of invasion into nearby structures (T1a | T1b | T2 | T3a | T3b | T4a | T4b)
  • N - Nodal classification indicating the involvement of lymph nodes (N0 | N1a | N1b)
  • M - Metastasis classification indicating the presence or absence of distant metastases (M0 | M1)
  • Stage - Overall stage of the cancer, typically determined by combining T, N, and M classifications (I | II | III | IVA | IVB)
  • Response - Cancer's response to treatment (Biochemical Incomplete | Indeterminate | Excellent | Structural Incomplete)

1.2. Data Description ¶

  1. The initial tabular dataset comprised 383 observations and 17 variables (including 1 target and 16 predictors).
    • 383 rows (observations)
    • 17 columns (variables)
      • 1/17 target (categorical)
        • Recurred
      • 1/17 predictor (numeric)
        • Age
      • 15/17 predictor (categorical)
        • Gender
        • Smoking
        • Hx_Smoking
        • Hx_Radiotherapy
        • Thyroid_Function
        • Physical_Examination
        • Adenopathy
        • Pathology
        • Focality
        • Risk
        • T
        • N
        • M
        • Stage
        • Response
In [1]:
##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import itertools
import os
import pickle
%matplotlib inline

from operator import add, mul, truediv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from scipy import stats
from scipy.stats import pointbiserialr, chi2_contingency

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, KFold, cross_val_score
from sklearn.inspection import permutation_importance
In [2]:
##################################
# Defining file paths
##################################
DATASETS_ORIGINAL_PATH = r"datasets\original"
DATASETS_FINAL_PATH = r"datasets\final\complete"
DATASETS_FINAL_TRAIN_PATH = r"datasets\final\train"
DATASETS_FINAL_TRAIN_FEATURES_PATH = r"datasets\final\train\features"
DATASETS_FINAL_TRAIN_TARGET_PATH = r"datasets\final\train\target"
DATASETS_FINAL_VALIDATION_PATH = r"datasets\final\validation"
DATASETS_FINAL_VALIDATION_FEATURES_PATH = r"datasets\final\validation\features"
DATASETS_FINAL_VALIDATION_TARGET_PATH = r"datasets\final\validation\target"
DATASETS_FINAL_TEST_PATH = r"datasets\final\test"
DATASETS_FINAL_TEST_FEATURES_PATH = r"datasets\final\test\features"
DATASETS_FINAL_TEST_TARGET_PATH = r"datasets\final\test\target"
DATASETS_PREPROCESSED_PATH = r"datasets\preprocessed"
DATASETS_PREPROCESSED_TRAIN_PATH = r"datasets\preprocessed\train"
DATASETS_PREPROCESSED_TRAIN_FEATURES_PATH = r"datasets\preprocessed\train\features"
DATASETS_PREPROCESSED_TRAIN_TARGET_PATH = r"datasets\preprocessed\train\target"
DATASETS_PREPROCESSED_VALIDATION_PATH = r"datasets\preprocessed\validation"
DATASETS_PREPROCESSED_VALIDATION_FEATURES_PATH = r"datasets\preprocessed\validation\features"
DATASETS_PREPROCESSED_VALIDATION_TARGET_PATH = r"datasets\preprocessed\validation\target"
DATASETS_PREPROCESSED_TEST_PATH = r"datasets\preprocessed\test"
DATASETS_PREPROCESSED_TEST_FEATURES_PATH = r"datasets\preprocessed\test\features"
DATASETS_PREPROCESSED_TEST_TARGET_PATH = r"datasets\preprocessed\test\target"
MODELS_PATH = r"models"
In [3]:
##################################
# Loading the dataset
# from the DATASETS_ORIGINAL_PATH
##################################
thyroid_cancer = pd.read_csv(os.path.join("..", DATASETS_ORIGINAL_PATH, "Thyroid_Diff.csv"))
In [4]:
##################################
# Performing a general exploration of the dataset
##################################
print('Dataset Dimensions: ')
display(thyroid_cancer.shape)
Dataset Dimensions: 
(383, 17)
In [5]:
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(thyroid_cancer.dtypes)
Column Names and Data Types:
Age                      int64
Gender                  object
Smoking                 object
Hx Smoking              object
Hx Radiotherapy         object
Thyroid Function        object
Physical Examination    object
Adenopathy              object
Pathology               object
Focality                object
Risk                    object
T                       object
N                       object
M                       object
Stage                   object
Response                object
Recurred                object
dtype: object
In [6]:
##################################
# Renaming and standardizing the column names
# to replace blanks with underscores
##################################
thyroid_cancer.columns = thyroid_cancer.columns.str.replace(" ", "_")
In [7]:
##################################
# Taking a snapshot of the dataset
##################################
thyroid_cancer.head()
Out[7]:
Age Gender Smoking Hx_Smoking Hx_Radiotherapy Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N M Stage Response Recurred
0 27 F No No No Euthyroid Single nodular goiter-left No Micropapillary Uni-Focal Low T1a N0 M0 I Indeterminate No
1 34 F No Yes No Euthyroid Multinodular goiter No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
2 30 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
3 62 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
4 62 F No No No Euthyroid Multinodular goiter No Micropapillary Multi-Focal Low T1a N0 M0 I Excellent No
In [8]:
##################################
# Selecting categorical columns (both object and categorical types)
# and listing the unique categorical levels
##################################
cat_cols = thyroid_cancer.select_dtypes(include=["object", "category"]).columns
for col in cat_cols:
    print(f"Categorical | Object Column: {col}")
    print(thyroid_cancer[col].unique())  
    print("-" * 40)
    
Categorical | Object Column: Gender
['F' 'M']
----------------------------------------
Categorical | Object Column: Smoking
['No' 'Yes']
----------------------------------------
Categorical | Object Column: Hx_Smoking
['No' 'Yes']
----------------------------------------
Categorical | Object Column: Hx_Radiotherapy
['No' 'Yes']
----------------------------------------
Categorical | Object Column: Thyroid_Function
['Euthyroid' 'Clinical Hyperthyroidism' 'Clinical Hypothyroidism'
 'Subclinical Hyperthyroidism' 'Subclinical Hypothyroidism']
----------------------------------------
Categorical | Object Column: Physical_Examination
['Single nodular goiter-left' 'Multinodular goiter'
 'Single nodular goiter-right' 'Normal' 'Diffuse goiter']
----------------------------------------
Categorical | Object Column: Adenopathy
['No' 'Right' 'Extensive' 'Left' 'Bilateral' 'Posterior']
----------------------------------------
Categorical | Object Column: Pathology
['Micropapillary' 'Papillary' 'Follicular' 'Hurthel cell']
----------------------------------------
Categorical | Object Column: Focality
['Uni-Focal' 'Multi-Focal']
----------------------------------------
Categorical | Object Column: Risk
['Low' 'Intermediate' 'High']
----------------------------------------
Categorical | Object Column: T
['T1a' 'T1b' 'T2' 'T3a' 'T3b' 'T4a' 'T4b']
----------------------------------------
Categorical | Object Column: N
['N0' 'N1b' 'N1a']
----------------------------------------
Categorical | Object Column: M
['M0' 'M1']
----------------------------------------
Categorical | Object Column: Stage
['I' 'II' 'IVB' 'III' 'IVA']
----------------------------------------
Categorical | Object Column: Response
['Indeterminate' 'Excellent' 'Structural Incomplete'
 'Biochemical Incomplete']
----------------------------------------
Categorical | Object Column: Recurred
['No' 'Yes']
----------------------------------------
In [9]:
##################################
# Correcting a category level
##################################
thyroid_cancer["Pathology"] = thyroid_cancer["Pathology"].replace("Hurthel cell", "Hurthle Cell")
In [10]:
##################################
# Setting the levels of the categorical variables
##################################
ordered_categories = {
    'Recurred': ['No', 'Yes'],
    'Gender': ['M', 'F'],
    'Smoking': ['No', 'Yes'],
    'Hx_Smoking': ['No', 'Yes'],
    'Hx_Radiotherapy': ['No', 'Yes'],
    'Thyroid_Function': ['Euthyroid', 'Subclinical Hypothyroidism', 'Subclinical Hyperthyroidism', 'Clinical Hypothyroidism', 'Clinical Hyperthyroidism'],
    'Physical_Examination': ['Normal', 'Single nodular goiter-left', 'Single nodular goiter-right', 'Multinodular goiter', 'Diffuse goiter'],
    'Adenopathy': ['No', 'Left', 'Right', 'Bilateral', 'Posterior', 'Extensive'],
    'Pathology': ['Hurthle Cell', 'Follicular', 'Micropapillary', 'Papillary'],
    'Focality': ['Uni-Focal', 'Multi-Focal'],
    'Risk': ['Low', 'Intermediate', 'High'],
    'T': ['T1a', 'T1b', 'T2', 'T3a', 'T3b', 'T4a', 'T4b'],
    'N': ['N0', 'N1a', 'N1b'],
    'M': ['M0', 'M1'],
    'Stage': ['I', 'II', 'III', 'IVA', 'IVB'],
    'Response': ['Excellent', 'Structural Incomplete', 'Biochemical Incomplete', 'Indeterminate'],
}
for col, levels in ordered_categories.items():
    thyroid_cancer[col] = thyroid_cancer[col].astype('category').cat.set_categories(levels, ordered=True)
In [11]:
##################################
# Performing a general exploration of the numeric variables
##################################
print('Numeric Variable Summary:')
display(thyroid_cancer.describe(include='number').transpose())
Numeric Variable Summary:
count mean std min 25% 50% 75% max
Age 383.0 40.866841 15.134494 15.0 29.0 37.0 51.0 82.0
In [12]:
##################################
# Performing a general exploration of the categorical variables
##################################
print('Categorical Variable Summary:')
display(thyroid_cancer.describe(include='category').transpose())
Categorical Variable Summary:
count unique top freq
Gender 383 2 F 312
Smoking 383 2 No 334
Hx_Smoking 383 2 No 355
Hx_Radiotherapy 383 2 No 376
Thyroid_Function 383 5 Euthyroid 332
Physical_Examination 383 5 Single nodular goiter-right 140
Adenopathy 383 6 No 277
Pathology 383 4 Papillary 287
Focality 383 2 Uni-Focal 247
Risk 383 3 Low 249
T 383 7 T2 151
N 383 3 N0 268
M 383 2 M0 365
Stage 383 5 I 333
Response 383 4 Excellent 208
Recurred 383 2 No 275
In [13]:
##################################
# Performing a general exploration of the categorical variable levels
# based on the ordered categories
##################################
ordered_cat_cols = thyroid_cancer.select_dtypes(include=["category"]).columns
for col in ordered_cat_cols:
    print(f"Column: {col}")
    print("Absolute Frequencies:")
    print(thyroid_cancer[col].value_counts().reindex(thyroid_cancer[col].cat.categories))
    print("\nNormalized Frequencies:")
    print(thyroid_cancer[col].value_counts(normalize=True).reindex(thyroid_cancer[col].cat.categories))
    print("-" * 50)
   
Column: Gender
Absolute Frequencies:
M     71
F    312
Name: count, dtype: int64

Normalized Frequencies:
M    0.185379
F    0.814621
Name: proportion, dtype: float64
--------------------------------------------------
Column: Smoking
Absolute Frequencies:
No     334
Yes     49
Name: count, dtype: int64

Normalized Frequencies:
No     0.872063
Yes    0.127937
Name: proportion, dtype: float64
--------------------------------------------------
Column: Hx_Smoking
Absolute Frequencies:
No     355
Yes     28
Name: count, dtype: int64

Normalized Frequencies:
No     0.926893
Yes    0.073107
Name: proportion, dtype: float64
--------------------------------------------------
Column: Hx_Radiotherapy
Absolute Frequencies:
No     376
Yes      7
Name: count, dtype: int64

Normalized Frequencies:
No     0.981723
Yes    0.018277
Name: proportion, dtype: float64
--------------------------------------------------
Column: Thyroid_Function
Absolute Frequencies:
Euthyroid                      332
Subclinical Hypothyroidism      14
Subclinical Hyperthyroidism      5
Clinical Hypothyroidism         12
Clinical Hyperthyroidism        20
Name: count, dtype: int64

Normalized Frequencies:
Euthyroid                      0.866841
Subclinical Hypothyroidism     0.036554
Subclinical Hyperthyroidism    0.013055
Clinical Hypothyroidism        0.031332
Clinical Hyperthyroidism       0.052219
Name: proportion, dtype: float64
--------------------------------------------------
Column: Physical_Examination
Absolute Frequencies:
Normal                           7
Single nodular goiter-left      89
Single nodular goiter-right    140
Multinodular goiter            140
Diffuse goiter                   7
Name: count, dtype: int64

Normalized Frequencies:
Normal                         0.018277
Single nodular goiter-left     0.232376
Single nodular goiter-right    0.365535
Multinodular goiter            0.365535
Diffuse goiter                 0.018277
Name: proportion, dtype: float64
--------------------------------------------------
Column: Adenopathy
Absolute Frequencies:
No           277
Left          17
Right         48
Bilateral     32
Posterior      2
Extensive      7
Name: count, dtype: int64

Normalized Frequencies:
No           0.723238
Left         0.044386
Right        0.125326
Bilateral    0.083551
Posterior    0.005222
Extensive    0.018277
Name: proportion, dtype: float64
--------------------------------------------------
Column: Pathology
Absolute Frequencies:
Hurthle Cell       20
Follicular         28
Micropapillary     48
Papillary         287
Name: count, dtype: int64

Normalized Frequencies:
Hurthle Cell      0.052219
Follicular        0.073107
Micropapillary    0.125326
Papillary         0.749347
Name: proportion, dtype: float64
--------------------------------------------------
Column: Focality
Absolute Frequencies:
Uni-Focal      247
Multi-Focal    136
Name: count, dtype: int64

Normalized Frequencies:
Uni-Focal      0.644909
Multi-Focal    0.355091
Name: proportion, dtype: float64
--------------------------------------------------
Column: Risk
Absolute Frequencies:
Low             249
Intermediate    102
High             32
Name: count, dtype: int64

Normalized Frequencies:
Low             0.650131
Intermediate    0.266319
High            0.083551
Name: proportion, dtype: float64
--------------------------------------------------
Column: T
Absolute Frequencies:
T1a     49
T1b     43
T2     151
T3a     96
T3b     16
T4a     20
T4b      8
Name: count, dtype: int64

Normalized Frequencies:
T1a    0.127937
T1b    0.112272
T2     0.394256
T3a    0.250653
T3b    0.041775
T4a    0.052219
T4b    0.020888
Name: proportion, dtype: float64
--------------------------------------------------
Column: N
Absolute Frequencies:
N0     268
N1a     22
N1b     93
Name: count, dtype: int64

Normalized Frequencies:
N0     0.699739
N1a    0.057441
N1b    0.242820
Name: proportion, dtype: float64
--------------------------------------------------
Column: M
Absolute Frequencies:
M0    365
M1     18
Name: count, dtype: int64

Normalized Frequencies:
M0    0.953003
M1    0.046997
Name: proportion, dtype: float64
--------------------------------------------------
Column: Stage
Absolute Frequencies:
I      333
II      32
III      4
IVA      3
IVB     11
Name: count, dtype: int64

Normalized Frequencies:
I      0.869452
II     0.083551
III    0.010444
IVA    0.007833
IVB    0.028721
Name: proportion, dtype: float64
--------------------------------------------------
Column: Response
Absolute Frequencies:
Excellent                 208
Structural Incomplete      91
Biochemical Incomplete     23
Indeterminate              61
Name: count, dtype: int64

Normalized Frequencies:
Excellent                 0.543081
Structural Incomplete     0.237598
Biochemical Incomplete    0.060052
Indeterminate             0.159269
Name: proportion, dtype: float64
--------------------------------------------------
Column: Recurred
Absolute Frequencies:
No     275
Yes    108
Name: count, dtype: int64

Normalized Frequencies:
No     0.718016
Yes    0.281984
Name: proportion, dtype: float64
--------------------------------------------------

1.3. Data Quality Assessment ¶

Data quality findings based on assessment are as follows:

  1. A total of 19 duplicated rows were identified.
    • In total, 35 observations were affected, consisting of 16 unique occurrences and 19 subsequent duplicates.
    • These 19 duplicates spanned 16 distinct variations, meaning some variations had multiple duplicates.
    • To clean the dataset, all 19 duplicate rows were removed, retaining only the first occurrence of each of the 16 unique variations.
  2. No missing data noted: no variable had Null.Count>0 or Fill.Rate<1.0.
  3. Low variance observed for 8 variables with First.Second.Mode.Ratio>5.
    • Hx_Radiotherapy: First.Second.Mode.Ratio = 51.000 (comprising 2 category levels)
    • M: First.Second.Mode.Ratio = 19.222 (comprising 2 category levels)
    • Thyroid_Function: First.Second.Mode.Ratio = 15.650 (comprising 5 category levels)
    • Hx_Smoking: First.Second.Mode.Ratio = 12.000 (comprising 2 category levels)
    • Stage: First.Second.Mode.Ratio = 9.812 (comprising 5 category levels)
    • Smoking: First.Second.Mode.Ratio = 6.428 (comprising 2 category levels)
    • Pathology: First.Second.Mode.Ratio = 6.022 (comprising 4 category levels)
    • Adenopathy: First.Second.Mode.Ratio = 5.375 (comprising 6 category levels)
  4. No high-cardinality variables noted: no variable had Unique.Count.Ratio>10.
  5. No high skewness noted: no variable had Skewness>3 or Skewness<(-3).
In [14]:
##################################
# Counting the number of duplicated rows
##################################
thyroid_cancer.duplicated().sum()
Out[14]:
19
In [15]:
##################################
# Exploring the duplicated rows
##################################
duplicated_rows = thyroid_cancer[thyroid_cancer.duplicated(keep=False)]
display(duplicated_rows)
Age Gender Smoking Hx_Smoking Hx_Radiotherapy Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N M Stage Response Recurred
8 51 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
9 40 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
22 36 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
32 36 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
38 40 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
40 51 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No
61 35 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
66 35 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
67 51 F No No No Euthyroid Single nodular goiter-left No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
69 51 F No No No Euthyroid Single nodular goiter-left No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
73 29 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
77 29 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No
106 26 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
110 31 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
113 32 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
115 37 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
119 28 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
120 37 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
121 26 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
123 28 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
132 32 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
136 21 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
137 32 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
138 26 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
142 42 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
161 22 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
166 31 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
168 21 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
170 38 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
175 34 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
178 38 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
183 26 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
187 34 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
189 42 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
196 22 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No
In [16]:
##################################
# Checking if duplicated rows have identical values across all columns
##################################
num_unique_dup_rows = duplicated_rows.drop_duplicates().shape[0]
num_total_dup_rows = duplicated_rows.shape[0]
if num_unique_dup_rows == 1:
    print("All duplicated rows have the same values across all columns.")
else:
    print(f"There are {num_unique_dup_rows} unique versions among the {num_total_dup_rows} duplicated rows.")
    
There are 16 unique versions among the 35 duplicated rows.
In [17]:
##################################
# Counting the unique variations among duplicated rows
##################################
unique_dup_variations = duplicated_rows.drop_duplicates()
variation_counts = duplicated_rows.value_counts().reset_index(name="Count")
print("Unique duplicated row variations and their counts:")
display(variation_counts)
Unique duplicated row variations and their counts:
Age Gender Smoking Hx_Smoking Hx_Radiotherapy Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N M Stage Response Recurred Count
0 26 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 4
1 32 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 3
2 21 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
3 22 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
4 28 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
5 29 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No 2
6 31 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
7 34 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
8 35 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T1b N0 M0 I Excellent No 2
9 36 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No 2
10 37 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
11 38 F No No No Euthyroid Single nodular goiter-right No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
12 40 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No 2
13 42 F No No No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 M0 I Excellent No 2
14 51 F No No No Euthyroid Single nodular goiter-left No Papillary Uni-Focal Low T1b N0 M0 I Excellent No 2
15 51 F No No No Euthyroid Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0 M0 I Excellent No 2
In [18]:
##################################
# Removing the duplicated rows and
# retaining only the first occurrence
##################################
thyroid_cancer_row_filtered = thyroid_cancer.drop_duplicates(keep="first")
print('Dataset Dimensions: ')
display(thyroid_cancer_row_filtered.shape)
Dataset Dimensions: 
(364, 17)
In [19]:
##################################
# Gathering the data types for each column
##################################
data_type_list = list(thyroid_cancer_row_filtered.dtypes)
In [20]:
##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(thyroid_cancer_row_filtered.columns)
In [21]:
##################################
# Gathering the number of observations for each column
##################################
row_count_list = list([len(thyroid_cancer_row_filtered)] * len(thyroid_cancer_row_filtered.columns))
In [22]:
##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(thyroid_cancer_row_filtered.isna().sum(axis=0))
In [23]:
##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(thyroid_cancer_row_filtered.count())
In [24]:
##################################
# Gathering the fill rate (non-missing proportion) for each column
##################################
fill_rate_list = map(truediv, non_null_count_list, row_count_list)
In [25]:
##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                              data_type_list,
                                              row_count_list,
                                              non_null_count_list,
                                              null_count_list,
                                              fill_rate_list), 
                                        columns=['Column.Name',
                                                 'Column.Type',
                                                 'Row.Count',
                                                 'Non.Null.Count',
                                                 'Null.Count',                                                 
                                                 'Fill.Rate'])
display(all_column_quality_summary)
Column.Name Column.Type Row.Count Non.Null.Count Null.Count Fill.Rate
0 Age int64 364 364 0 1.0
1 Gender category 364 364 0 1.0
2 Smoking category 364 364 0 1.0
3 Hx_Smoking category 364 364 0 1.0
4 Hx_Radiotherapy category 364 364 0 1.0
5 Thyroid_Function category 364 364 0 1.0
6 Physical_Examination category 364 364 0 1.0
7 Adenopathy category 364 364 0 1.0
8 Pathology category 364 364 0 1.0
9 Focality category 364 364 0 1.0
10 Risk category 364 364 0 1.0
11 T category 364 364 0 1.0
12 N category 364 364 0 1.0
13 M category 364 364 0 1.0
14 Stage category 364 364 0 1.0
15 Response category 364 364 0 1.0
16 Recurred category 364 364 0 1.0
In [26]:
##################################
# Counting the number of columns
# with Fill.Rate < 1.00
##################################
len(all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1)])
Out[26]:
0
In [27]:
##################################
# Identifying the columns
# with Fill.Rate < 0.90
##################################
column_low_fill_rate = all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<0.90)]
In [28]:
##################################
# Gathering the indices for each observation
##################################
row_index_list = thyroid_cancer_row_filtered.index
In [29]:
##################################
# Gathering the number of columns for each observation
##################################
column_count_list = list([len(thyroid_cancer_row_filtered.columns)] * len(thyroid_cancer_row_filtered))
In [30]:
##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(thyroid_cancer_row_filtered.isna().sum(axis=1))
In [31]:
##################################
# Gathering the missing data percentage for each row
##################################
missing_rate_list = map(truediv, null_row_list, column_count_list)
In [32]:
##################################
# Identifying the rows
# with missing data
##################################
all_row_quality_summary = pd.DataFrame(zip(row_index_list,
                                           column_count_list,
                                           null_row_list,
                                           missing_rate_list), 
                                        columns=['Row.Name',
                                                 'Column.Count',
                                                 'Null.Count',                                                 
                                                 'Missing.Rate'])
display(all_row_quality_summary)
Row.Name Column.Count Null.Count Missing.Rate
0 0 17 0 0.0
1 1 17 0 0.0
2 2 17 0 0.0
3 3 17 0 0.0
4 4 17 0 0.0
... ... ... ... ...
359 378 17 0 0.0
360 379 17 0 0.0
361 380 17 0 0.0
362 381 17 0 0.0
363 382 17 0 0.0

364 rows × 4 columns

In [33]:
##################################
# Counting the number of rows
# with Missing.Rate > 0.00
##################################
len(all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.00)])
Out[33]:
0
In [34]:
##################################
# Formulating the dataset
# with numeric columns only
##################################
thyroid_cancer_numeric = thyroid_cancer_row_filtered.select_dtypes(include='number')
In [35]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = thyroid_cancer_numeric.columns
In [36]:
##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = thyroid_cancer_numeric.min()
In [37]:
##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = thyroid_cancer_numeric.mean()
In [38]:
##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = thyroid_cancer_numeric.median()
In [39]:
##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = thyroid_cancer_numeric.max()
In [40]:
##################################
# Gathering the first mode values for each numeric column
##################################
numeric_first_mode_list = [thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[0] for x in thyroid_cancer_numeric]
In [41]:
##################################
# Gathering the second mode values for each numeric column
##################################
numeric_second_mode_list = [thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[1] for x in thyroid_cancer_numeric]
In [42]:
##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [thyroid_cancer_numeric[x].isin([thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in thyroid_cancer_numeric]
In [43]:
##################################
# Gathering the count of second mode values for each numeric column
##################################
numeric_second_mode_count_list = [thyroid_cancer_numeric[x].isin([thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in thyroid_cancer_numeric]
In [44]:
##################################
# Gathering the first mode to second mode ratio for each numeric column
##################################
numeric_first_second_mode_ratio_list = map(truediv, numeric_first_mode_count_list, numeric_second_mode_count_list)
In [45]:
##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = thyroid_cancer_numeric.nunique(dropna=True)
In [46]:
##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = list([len(thyroid_cancer_numeric)] * len(thyroid_cancer_numeric.columns))
In [47]:
##################################
# Gathering the unique to count ratio for each numeric column
##################################
numeric_unique_count_ratio_list = map(truediv, numeric_unique_count_list, numeric_row_count_list)
In [48]:
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = thyroid_cancer_numeric.skew()
In [49]:
##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = thyroid_cancer_numeric.kurtosis()
In [50]:
##################################
# Generating a column quality summary for the numeric column
##################################
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                numeric_minimum_list,
                                                numeric_mean_list,
                                                numeric_median_list,
                                                numeric_maximum_list,
                                                numeric_first_mode_list,
                                                numeric_second_mode_list,
                                                numeric_first_mode_count_list,
                                                numeric_second_mode_count_list,
                                                numeric_first_second_mode_ratio_list,
                                                numeric_unique_count_list,
                                                numeric_row_count_list,
                                                numeric_unique_count_ratio_list,
                                                numeric_skewness_list,
                                                numeric_kurtosis_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Minimum',
                                                 'Mean',
                                                 'Median',
                                                 'Maximum',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio',
                                                 'Skewness',
                                                 'Kurtosis'])
display(numeric_column_quality_summary)
Numeric.Column.Name Minimum Mean Median Maximum First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio Skewness Kurtosis
0 Age 15 41.25 38.0 82 31 27 21 13 1.615385 65 364 0.178571 0.678269 -0.359255
In [51]:
##################################
# Counting the number of numeric columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['First.Second.Mode.Ratio']>5)])
Out[51]:
0
In [52]:
##################################
# Counting the number of numeric columns
# with Unique.Count.Ratio > 10.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Unique.Count.Ratio']>10)])
Out[52]:
0
In [53]:
##################################
# Counting the number of numeric columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Skewness']>3) | (numeric_column_quality_summary['Skewness']<(-3))])
Out[53]:
0
In [54]:
##################################
# Formulating the dataset
# with categorical columns only
##################################
thyroid_cancer_categorical = thyroid_cancer_row_filtered.select_dtypes(include='category')
In [55]:
##################################
# Gathering the variable names for the categorical column
##################################
categorical_variable_name_list = thyroid_cancer_categorical.columns
In [56]:
##################################
# Gathering the first mode values for each categorical column
##################################
categorical_first_mode_list = [thyroid_cancer_row_filtered[x].value_counts().index.tolist()[0] for x in thyroid_cancer_categorical]
In [57]:
##################################
# Gathering the second mode values for each categorical column
##################################
categorical_second_mode_list = [thyroid_cancer_row_filtered[x].value_counts().index.tolist()[1] for x in thyroid_cancer_categorical]
In [58]:
##################################
# Gathering the count of first mode values for each categorical column
##################################
categorical_first_mode_count_list = [thyroid_cancer_categorical[x].isin([thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in thyroid_cancer_categorical]
In [59]:
##################################
# Gathering the count of second mode values for each categorical column
##################################
categorical_second_mode_count_list = [thyroid_cancer_categorical[x].isin([thyroid_cancer_row_filtered[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in thyroid_cancer_categorical]
In [60]:
##################################
# Gathering the first mode to second mode ratio for each categorical column
##################################
categorical_first_second_mode_ratio_list = map(truediv, categorical_first_mode_count_list, categorical_second_mode_count_list)
In [61]:
##################################
# Gathering the count of unique values for each categorical column
##################################
categorical_unique_count_list = thyroid_cancer_categorical.nunique(dropna=True)
In [62]:
##################################
# Gathering the number of observations for each categorical column
##################################
categorical_row_count_list = list([len(thyroid_cancer_categorical)] * len(thyroid_cancer_categorical.columns))
In [63]:
##################################
# Gathering the unique to count ratio for each categorical column
##################################
categorical_unique_count_ratio_list = map(truediv, categorical_unique_count_list, categorical_row_count_list)
In [64]:
##################################
# Generating a column quality summary for the categorical columns
##################################
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
                                                    categorical_first_mode_list,
                                                    categorical_second_mode_list,
                                                    categorical_first_mode_count_list,
                                                    categorical_second_mode_count_list,
                                                    categorical_first_second_mode_ratio_list,
                                                    categorical_unique_count_list,
                                                    categorical_row_count_list,
                                                    categorical_unique_count_ratio_list), 
                                        columns=['Categorical.Column.Name',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio'])
display(categorical_column_quality_summary)
Categorical.Column.Name First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio
0 Gender F M 293 71 4.126761 2 364 0.005495
1 Smoking No Yes 315 49 6.428571 2 364 0.005495
2 Hx_Smoking No Yes 336 28 12.000000 2 364 0.005495
3 Hx_Radiotherapy No Yes 357 7 51.000000 2 364 0.005495
4 Thyroid_Function Euthyroid Clinical Hyperthyroidism 313 20 15.650000 5 364 0.013736
5 Physical_Examination Multinodular goiter Single nodular goiter-right 135 127 1.062992 5 364 0.013736
6 Adenopathy No Right 258 48 5.375000 6 364 0.016484
7 Pathology Papillary Micropapillary 271 45 6.022222 4 364 0.010989
8 Focality Uni-Focal Multi-Focal 228 136 1.676471 2 364 0.005495
9 Risk Low Intermediate 230 102 2.254902 3 364 0.008242
10 T T2 T3a 138 96 1.437500 7 364 0.019231
11 N N0 N1b 249 93 2.677419 3 364 0.008242
12 M M0 M1 346 18 19.222222 2 364 0.005495
13 Stage I II 314 32 9.812500 5 364 0.013736
14 Response Excellent Structural Incomplete 189 91 2.076923 4 364 0.010989
15 Recurred No Yes 256 108 2.370370 2 364 0.005495
In [65]:
##################################
# Counting the number of categorical columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['First.Second.Mode.Ratio']>5)])
Out[65]:
8
In [66]:
##################################
# Identifying the categorical columns
# with First.Second.Mode.Ratio > 5.00
##################################
display(categorical_column_quality_summary[(categorical_column_quality_summary['First.Second.Mode.Ratio']>5)].sort_values(by=['First.Second.Mode.Ratio'], ascending=False))
Categorical.Column.Name First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio
3 Hx_Radiotherapy No Yes 357 7 51.000000 2 364 0.005495
12 M M0 M1 346 18 19.222222 2 364 0.005495
4 Thyroid_Function Euthyroid Clinical Hyperthyroidism 313 20 15.650000 5 364 0.013736
2 Hx_Smoking No Yes 336 28 12.000000 2 364 0.005495
13 Stage I II 314 32 9.812500 5 364 0.013736
1 Smoking No Yes 315 49 6.428571 2 364 0.005495
7 Pathology Papillary Micropapillary 271 45 6.022222 4 364 0.010989
6 Adenopathy No Right 258 48 5.375000 6 364 0.016484
In [67]:
##################################
# Counting the number of categorical columns
# with Unique.Count.Ratio > 10.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['Unique.Count.Ratio']>10)])
Out[67]:
0

1.4. Data Preprocessing ¶

1.4.1 Data Splitting ¶

  1. The baseline dataset (with duplicate rows removed from the original dataset) is comprised of:
    • 364 rows (observations)
      • 256 Recurred=No: 70.33%
      • 108 Recurred=Yes: 29.67%
    • 17 columns (variables)
      • 1/17 target (categorical)
        • Recurred
      • 1/17 predictor (numeric)
        • Age
      • 15/17 predictor (categorical)
        • Gender
        • Smoking
        • Hx_Smoking
        • Hx_Radiotherapy
        • Thyroid_Function
        • Physical_Examination
        • Adenopathy
        • Pathology
        • Focality
        • Risk
        • T
        • N
        • M
        • Stage
        • Response
  2. The baseline dataset was divided into three subsets using a fixed random seed (the effective subset sizes are verified in the sketch after this list):
    • test data: 25% of the original data with class stratification applied
    • train data (initial): 75% of the original data with class stratification applied
      • train data (final): 75% of the train (initial) data with class stratification applied
      • validation data: 25% of the train (initial) data with class stratification applied
  3. Models were developed from the train data (final). Using the same dataset, a subset of models with optimal hyperparameters was selected based on cross-validation.
  4. Among candidate models with optimal hyperparameters, the final model was selected based on performance on the validation data.
  5. Performance of the selected final model (and other candidate models for post-model selection comparison) was evaluated using the test data.
  6. The train data (final) subset is comprised of:
    • 204 rows (observations)
      • 143 Recurred=No: 70.10%
      • 61 Recurred=Yes: 29.90%
    • 17 columns (variables)
  7. The validation data subset is comprised of:
    • 69 rows (observations)
      • 49 Recurred=No: 71.01%
      • 20 Recurred=Yes: 28.99%
    • 17 columns (variables)
  8. The test data subset is comprised of:
    • 91 rows (observations)
      • 64 Recurred=No: 70.33%
      • 27 Recurred=Yes: 29.67%
    • 17 columns (variables)
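As a sanity check on the nested 75-25 splits described above, the short sketch below reproduces the expected subset sizes. This is an illustrative calculation only, not part of the modeling pipeline; it relies on scikit-learn's behavior of rounding the test fraction up when test_size is given as a float, which matches the 91 and 69 counts reported below.
##################################
# Verifying the expected subset sizes
# implied by the nested 75-25 splits
# (illustrative sketch only)
##################################
import math

baseline_rows = 364
test_rows = math.ceil(baseline_rows * 0.25)              # 91
train_initial_rows = baseline_rows - test_rows           # 273
validation_rows = math.ceil(train_initial_rows * 0.25)   # 69
train_final_rows = train_initial_rows - validation_rows  # 204
print(train_final_rows, validation_rows, test_rows)      # 204 69 91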
In [68]:
##################################
# Creating a dataset copy
# of the row filtered data
##################################
thyroid_cancer_baseline = thyroid_cancer_row_filtered.copy()
In [69]:
##################################
# Performing a general exploration
# of the baseline dataset
##################################
print('Final Dataset Dimensions: ')
display(thyroid_cancer_baseline.shape)
Final Dataset Dimensions: 
(364, 17)
In [70]:
print('Target Variable Breakdown: ')
thyroid_cancer_breakdown = thyroid_cancer_baseline.groupby('Recurred', observed=True).size().reset_index(name='Count')
thyroid_cancer_breakdown['Percentage'] = (thyroid_cancer_breakdown['Count'] / len(thyroid_cancer_baseline)) * 100
display(thyroid_cancer_breakdown)
Target Variable Breakdown: 
Recurred Count Percentage
0 No 256 70.32967
1 Yes 108 29.67033
In [71]:
##################################
# Formulating the train and test data
# from the final dataset
# by applying stratification and
# using a 75-25 ratio
##################################
thyroid_cancer_train_initial, thyroid_cancer_test = train_test_split(thyroid_cancer_baseline, 
                                                               test_size=0.25, 
                                                               stratify=thyroid_cancer_baseline['Recurred'], 
                                                               random_state=987654321)
In [72]:
##################################
# Performing a general exploration
# of the initial training dataset
##################################
X_train_initial = thyroid_cancer_train_initial.drop('Recurred', axis = 1)
y_train_initial = thyroid_cancer_train_initial['Recurred']
print('Initial Train Dataset Dimensions: ')
display(X_train_initial.shape)
display(y_train_initial.shape)
print('Initial Train Target Variable Breakdown: ')
display(y_train_initial.value_counts())
print('Initial Train Target Variable Proportion: ')
display(y_train_initial.value_counts(normalize = True))
Initial Train Dataset Dimensions: 
(273, 16)
(273,)
Initial Train Target Variable Breakdown: 
Recurred
No     192
Yes     81
Name: count, dtype: int64
Initial Train Target Variable Proportion: 
Recurred
No     0.703297
Yes    0.296703
Name: proportion, dtype: float64
In [73]:
##################################
# Performing a general exploration
# of the test dataset
##################################
X_test = thyroid_cancer_test.drop('Recurred', axis = 1)
y_test = thyroid_cancer_test['Recurred']
print('Test Dataset Dimensions: ')
display(X_test.shape)
display(y_test.shape)
print('Test Target Variable Breakdown: ')
display(y_test.value_counts())
print('Test Target Variable Proportion: ')
display(y_test.value_counts(normalize = True))
Test Dataset Dimensions: 
(91, 16)
(91,)
Test Target Variable Breakdown: 
Recurred
No     64
Yes    27
Name: count, dtype: int64
Test Target Variable Proportion: 
Recurred
No     0.703297
Yes    0.296703
Name: proportion, dtype: float64
In [74]:
##################################
# Formulating the train and validation data
# from the train dataset
# by applying stratification and
# using a 75-25 ratio
##################################
thyroid_cancer_train, thyroid_cancer_validation = train_test_split(thyroid_cancer_train_initial, 
                                                             test_size=0.25, 
                                                             stratify=thyroid_cancer_train_initial['Recurred'], 
                                                             random_state=987654321)
In [75]:
##################################
# Performing a general exploration
# of the final training dataset
##################################
X_train = thyroid_cancer_train.drop('Recurred', axis = 1)
y_train = thyroid_cancer_train['Recurred']
print('Final Train Dataset Dimensions: ')
display(X_train.shape)
display(y_train.shape)
print('Final Train Target Variable Breakdown: ')
display(y_train.value_counts())
print('Final Train Target Variable Proportion: ')
display(y_train.value_counts(normalize = True))
Final Train Dataset Dimensions: 
(204, 16)
(204,)
Final Train Target Variable Breakdown: 
Recurred
No     143
Yes     61
Name: count, dtype: int64
Final Train Target Variable Proportion: 
Recurred
No     0.70098
Yes    0.29902
Name: proportion, dtype: float64
In [76]:
##################################
# Performing a general exploration
# of the validation dataset
##################################
X_validation = thyroid_cancer_validation.drop('Recurred', axis = 1)
y_validation = thyroid_cancer_validation['Recurred']
print('Validation Dataset Dimensions: ')
display(X_validation.shape)
display(y_validation.shape)
print('Validation Target Variable Breakdown: ')
display(y_validation.value_counts())
print('Validation Target Variable Proportion: ')
display(y_validation.value_counts(normalize = True))
Validation Dataset Dimensions: 
(69, 16)
(69,)
Validation Target Variable Breakdown: 
Recurred
No     49
Yes    20
Name: count, dtype: int64
Validation Target Variable Proportion: 
Recurred
No     0.710145
Yes    0.289855
Name: proportion, dtype: float64
In [77]:
##################################
# Saving the training data
# to the DATASETS_FINAL_TRAIN_PATH
# and DATASETS_FINAL_TRAIN_FEATURES_PATH
# and DATASETS_FINAL_TRAIN_TARGET_PATH
##################################
thyroid_cancer_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_PATH, "thyroid_cancer_train.csv"), index=False)
X_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_FEATURES_PATH, "X_train.csv"), index=False)
y_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_TARGET_PATH, "y_train.csv"), index=False)
In [78]:
##################################
# Saving the validation data
# to the DATASETS_FINAL_VALIDATION_PATH
# and DATASETS_FINAL_VALIDATION_FEATURE_PATH
# and DATASETS_FINAL_VALIDATION_TARGET_PATH
##################################
thyroid_cancer_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_PATH, "thyroid_cancer_validation.csv"), index=False)
X_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_FEATURES_PATH, "X_validation.csv"), index=False)
y_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_TARGET_PATH, "y_validation.csv"), index=False)
In [79]:
##################################
# Saving the test data
# to the DATASETS_FINAL_TEST_PATH
# and DATASETS_FINAL_TEST_FEATURES_PATH
# and DATASETS_FINAL_TEST_TARGET_PATH
##################################
thyroid_cancer_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_PATH, "thyroid_cancer_test.csv"), index=False)
X_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_FEATURES_PATH, "X_test.csv"), index=False)
y_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_TARGET_PATH, "y_test.csv"), index=False)

1.4.2 Data Profiling ¶

  1. No significant distributional anomalies were observed for the numeric predictor Age.
  2. 9 categorical predictors were observed with sparse categories containing too few cases per level, which risks poor generalization and cross-validation issues:
    • Thyroid_Function:
      • 171 Thyroid_Function=Euthyroid: 83.82%
      • 10 Thyroid_Function=Subclinical Hypothyroidism: 4.90%
      • 3 Thyroid_Function=Subclinical Hyperthyroidism: 1.47%
      • 7 Thyroid_Function=Clinical Hypothyroidism: 3.43%
      • 13 Thyroid_Function=Clinical Hyperthyroidism: 6.37%
    • Physical_Examination:
      • 4 Physical_Examination=Normal: 1.96%
      • 50 Physical_Examination=Single nodular goiter-left: 24.51%
      • 68 Physical_Examination=Single nodular goiter-right: 33.33%
      • 79 Physical_Examination=Multinodular goiter: 38.73%
      • 3 Physical_Examination=Diffuse goiter: 1.47%
    • Adenopathy:
      • 144 Adenopathy=No: 70.59%
      • 14 Adenopathy=Left: 6.86%
      • 21 Adenopathy=Right: 10.29%
      • 19 Adenopathy=Bilateral: 9.31%
      • 2 Adenopathy=Posterior: 0.98%
      • 4 Adenopathy=Extensive: 1.96%
    • Pathology:
      • 15 Pathology=Hurthle Cell: 7.35%
      • 14 Pathology=Follicular: 6.86%
      • 26 Pathology=Micropapillary: 12.75%
      • 149 Pathology=Papillary: 73.04%
    • Risk:
      • 127 Risk=Low: 62.25%
      • 60 Risk=Intermediate: 29.41%
      • 17 Risk=High: 8.33%
    • T:
      • 26 T=T1a: 12.75%
      • 21 T=T1b: 10.29%
      • 73 T=T2: 35.78%
      • 58 T=T3a: 28.43%
      • 10 T=T3b: 4.90%
      • 12 T=T4a: 5.88%
      • 4 T=T4b: 1.96%
    • N:
      • 139 N=N0: 68.14%
      • 11 N=N1a: 5.39%
      • 54 N=N1b: 26.47%
    • Stage:
      • 174 Stage=I: 85.29%
      • 21 Stage=II: 10.29%
      • 2 Stage=III: 0.98%
      • 2 Stage=IVA: 0.98%
      • 5 Stage=IVB: 2.45%
    • Response:
      • 109 Response=Excellent: 53.43%
      • 53 Response=Structural Incomplete: 25.98%
      • 8 Response=Biochemical Incomplete: 3.92%
      • 34 Response=Indeterminate: 16.67%
  3. 3 categorical predictors were excluded from the dataset after having been observed with extremely low variance, with categories showing very few or almost no variations across observations, which may limit predictive power or drive increased model complexity without performance gains:
    • Hx_Smoking:
      • 193 Hx_Smoking=No: 94.61%
      • 11 Hx_Smoking=Yes: 5.39%
    • Hx_Radiotherapy:
      • 202 Hx_Radiotherapy=No: 99.02%
      • 2 Hx_Radiotherapy=Yes: 0.98%
    • M:
      • 194 M=M0: 95.10%
      • 10 M=M1: 4.90%
In [80]:
##################################
# Segregating the target
# and predictor variables
##################################
thyroid_cancer_train_predictors = thyroid_cancer_train.iloc[:,:-1].columns
thyroid_cancer_train_predictors_numeric = thyroid_cancer_train.iloc[:,:-1].loc[:, thyroid_cancer_train.iloc[:,:-1].columns == 'Age'].columns
thyroid_cancer_train_predictors_categorical = thyroid_cancer_train.iloc[:,:-1].loc[:,thyroid_cancer_train.iloc[:,:-1].columns != 'Age'].columns
In [81]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = thyroid_cancer_train_predictors_numeric
In [82]:
##################################
# Segregating the target variable
# and numeric predictors
##################################
histogram_grouping_variable = 'Recurred'
histogram_frequency_variable = numeric_variable_name_list.values[0]
In [83]:
##################################
# Comparing the numeric predictors
# grouped by the target variable
##################################
colors = plt.get_cmap('tab10').colors
plt.figure(figsize=(7, 5))
group_no = thyroid_cancer_train[thyroid_cancer_train[histogram_grouping_variable] == 'No'][histogram_frequency_variable]
group_yes = thyroid_cancer_train[thyroid_cancer_train[histogram_grouping_variable] == 'Yes'][histogram_frequency_variable]
plt.hist(group_no, bins=20, alpha=0.5, color=colors[0], label='No', edgecolor='black')
plt.hist(group_yes, bins=20, alpha=0.5, color=colors[1], label='Yes', edgecolor='black')
plt.title(f'{histogram_grouping_variable} Versus {histogram_frequency_variable}')
plt.xlabel(histogram_frequency_variable)
plt.ylabel('Frequency')
plt.legend()
plt.show()
[Figure: overlaid histograms of Age grouped by Recurred]
In [84]:
##################################
# Performing a general exploration of the categorical variable levels
# based on the ordered categories
##################################
ordered_cat_cols = thyroid_cancer_train.select_dtypes(include=["category"]).columns
for col in ordered_cat_cols:
    print(f"Column: {col}")
    print("Absolute Frequencies:")
    print(thyroid_cancer_train[col].value_counts().reindex(thyroid_cancer_train[col].cat.categories))
    print("\nNormalized Frequencies:")
    print(thyroid_cancer_train[col].value_counts(normalize=True).reindex(thyroid_cancer_train[col].cat.categories))
    print("-" * 50)
    
Column: Gender
Absolute Frequencies:
M     44
F    160
Name: count, dtype: int64

Normalized Frequencies:
M    0.215686
F    0.784314
Name: proportion, dtype: float64
--------------------------------------------------
Column: Smoking
Absolute Frequencies:
No     177
Yes     27
Name: count, dtype: int64

Normalized Frequencies:
No     0.867647
Yes    0.132353
Name: proportion, dtype: float64
--------------------------------------------------
Column: Hx_Smoking
Absolute Frequencies:
No     193
Yes     11
Name: count, dtype: int64

Normalized Frequencies:
No     0.946078
Yes    0.053922
Name: proportion, dtype: float64
--------------------------------------------------
Column: Hx_Radiotherapy
Absolute Frequencies:
No     202
Yes      2
Name: count, dtype: int64

Normalized Frequencies:
No     0.990196
Yes    0.009804
Name: proportion, dtype: float64
--------------------------------------------------
Column: Thyroid_Function
Absolute Frequencies:
Euthyroid                      171
Subclinical Hypothyroidism      10
Subclinical Hyperthyroidism      3
Clinical Hypothyroidism          7
Clinical Hyperthyroidism        13
Name: count, dtype: int64

Normalized Frequencies:
Euthyroid                      0.838235
Subclinical Hypothyroidism     0.049020
Subclinical Hyperthyroidism    0.014706
Clinical Hypothyroidism        0.034314
Clinical Hyperthyroidism       0.063725
Name: proportion, dtype: float64
--------------------------------------------------
Column: Physical_Examination
Absolute Frequencies:
Normal                          4
Single nodular goiter-left     50
Single nodular goiter-right    68
Multinodular goiter            79
Diffuse goiter                  3
Name: count, dtype: int64

Normalized Frequencies:
Normal                         0.019608
Single nodular goiter-left     0.245098
Single nodular goiter-right    0.333333
Multinodular goiter            0.387255
Diffuse goiter                 0.014706
Name: proportion, dtype: float64
--------------------------------------------------
Column: Adenopathy
Absolute Frequencies:
No           144
Left          14
Right         21
Bilateral     19
Posterior      2
Extensive      4
Name: count, dtype: int64

Normalized Frequencies:
No           0.705882
Left         0.068627
Right        0.102941
Bilateral    0.093137
Posterior    0.009804
Extensive    0.019608
Name: proportion, dtype: float64
--------------------------------------------------
Column: Pathology
Absolute Frequencies:
Hurthle Cell       15
Follicular         14
Micropapillary     26
Papillary         149
Name: count, dtype: int64

Normalized Frequencies:
Hurthle Cell      0.073529
Follicular        0.068627
Micropapillary    0.127451
Papillary         0.730392
Name: proportion, dtype: float64
--------------------------------------------------
Column: Focality
Absolute Frequencies:
Uni-Focal      129
Multi-Focal     75
Name: count, dtype: int64

Normalized Frequencies:
Uni-Focal      0.632353
Multi-Focal    0.367647
Name: proportion, dtype: float64
--------------------------------------------------
Column: Risk
Absolute Frequencies:
Low             127
Intermediate     60
High             17
Name: count, dtype: int64

Normalized Frequencies:
Low             0.622549
Intermediate    0.294118
High            0.083333
Name: proportion, dtype: float64
--------------------------------------------------
Column: T
Absolute Frequencies:
T1a    26
T1b    21
T2     73
T3a    58
T3b    10
T4a    12
T4b     4
Name: count, dtype: int64

Normalized Frequencies:
T1a    0.127451
T1b    0.102941
T2     0.357843
T3a    0.284314
T3b    0.049020
T4a    0.058824
T4b    0.019608
Name: proportion, dtype: float64
--------------------------------------------------
Column: N
Absolute Frequencies:
N0     139
N1a     11
N1b     54
Name: count, dtype: int64

Normalized Frequencies:
N0     0.681373
N1a    0.053922
N1b    0.264706
Name: proportion, dtype: float64
--------------------------------------------------
Column: M
Absolute Frequencies:
M0    194
M1     10
Name: count, dtype: int64

Normalized Frequencies:
M0    0.95098
M1    0.04902
Name: proportion, dtype: float64
--------------------------------------------------
Column: Stage
Absolute Frequencies:
I      174
II      21
III      2
IVA      2
IVB      5
Name: count, dtype: int64

Normalized Frequencies:
I      0.852941
II     0.102941
III    0.009804
IVA    0.009804
IVB    0.024510
Name: proportion, dtype: float64
--------------------------------------------------
Column: Response
Absolute Frequencies:
Excellent                 109
Structural Incomplete      53
Biochemical Incomplete      8
Indeterminate              34
Name: count, dtype: int64

Normalized Frequencies:
Excellent                 0.534314
Structural Incomplete     0.259804
Biochemical Incomplete    0.039216
Indeterminate             0.166667
Name: proportion, dtype: float64
--------------------------------------------------
Column: Recurred
Absolute Frequencies:
No     143
Yes     61
Name: count, dtype: int64

Normalized Frequencies:
No     0.70098
Yes    0.29902
Name: proportion, dtype: float64
--------------------------------------------------
In [85]:
##################################
# Segregating the target variable
# and categorical predictors
##################################
proportion_y_variables = thyroid_cancer_train_predictors_categorical
proportion_x_variable = 'Recurred'
In [86]:
##################################
# Defining the number of 
# rows and columns for the subplots
##################################
num_rows = 5
num_cols = 3

##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 25))

##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()

##################################
# Formulating the individual stacked column plots
# for all categorical columns
##################################
for i, y_variable in enumerate(proportion_y_variables):
    ax = axes[i]
    category_counts = thyroid_cancer_train.groupby([proportion_x_variable, y_variable], observed=True).size().unstack(fill_value=0)
    category_proportions = category_counts.div(category_counts.sum(axis=1), axis=0)
    category_proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{proportion_x_variable} Versus {y_variable}')
    ax.set_xlabel(proportion_x_variable)
    ax.set_ylabel('Proportions')
    ax.legend(loc="lower center")

##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()

##################################
# Presenting the subplots
##################################
plt.show()
[Figure: stacked bar charts of categorical predictor proportions by Recurred]
In [87]:
##################################
# Removing predictors observed with extreme
# near-zero variance and a limited number of levels
##################################
thyroid_cancer_train_column_filtered = thyroid_cancer_train.drop(columns=['Hx_Radiotherapy','M','Hx_Smoking'])
thyroid_cancer_train_column_filtered.head()
Out[87]:
Age Gender Smoking Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N Stage Response Recurred
140 28 F No Euthyroid Multinodular goiter No Papillary Uni-Focal Low T2 N0 I Excellent No
205 36 F No Euthyroid Single nodular goiter-right Right Papillary Uni-Focal Low T2 N1b I Indeterminate No
277 41 M Yes Euthyroid Single nodular goiter-right No Hurthle Cell Multi-Focal Intermediate T3a N0 I Excellent No
294 42 M No Subclinical Hypothyroidism Single nodular goiter-right No Papillary Multi-Focal Intermediate T3a N1a I Indeterminate No
268 32 F No Euthyroid Single nodular goiter-left No Papillary Uni-Focal Low T3a N0 I Excellent No

1.4.3 Category Aggregation and Encoding ¶

  1. Category aggregation was applied to the previously identified high-cardinality categorical predictors whose levels contained only a few observations, to improve model stability during cross-validation and enhance generalization:
    • Thyroid_Function:
      • 171 Thyroid_Function=Euthyroid: 83.82%
      • 33 Thyroid_Function=Hypothyroidism or Hyperthyroidism: 16.18%
    • Physical_Examination:
      • 122 Physical_Examination=Normal or Single Nodular Goiter: 59.80%
      • 82 Physical_Examination=Multinodular or Diffuse Goiter: 40.20%
    • Adenopathy:
      • 144 Adenopathy=No: 70.59%
      • 60 Adenopathy=Yes: 29.41%
    • Pathology:
      • 29 Pathology=Non-Papillary: 14.22%
      • 175 Pathology=Papillary: 85.78%
    • Risk:
      • 127 Risk=Low: 62.25%
      • 77 Risk=Intermediate to High: 37.75%
    • T:
      • 120 T=T1 to T2: 58.82%
      • 84 T=T3 to T4b: 41.18%
    • N:
      • 139 N=N0: 68.14%
      • 65 N=N1: 31.86%
    • Stage:
      • 174 Stage=I: 85.29%
      • 30 Stage=II to IVB: 14.71%
    • Response:
      • 109 Response=Excellent: 53.43%
      • 95 Response=Indeterminate or Incomplete: 46.57%
In [88]:
##################################
# Merging small categories into broader groups 
# for certain categorical predictors
# to ensure sufficient representation in statistical models 
# and prevent sparsity issues in cross-validation
##################################
thyroid_cancer_train_column_filtered['Thyroid_Function'] = thyroid_cancer_train_column_filtered['Thyroid_Function'].map(lambda x: 'Euthyroid' if (x in ['Euthyroid'])  else 'Hypothyroidism or Hyperthyroidism').astype('category')
thyroid_cancer_train_column_filtered['Physical_Examination'] = thyroid_cancer_train_column_filtered['Physical_Examination'].map(lambda x: 'Normal or Single Nodular Goiter' if (x in ['Normal', 'Single nodular goiter-left', 'Single nodular goiter-right'])  else 'Multinodular or Diffuse Goiter').astype('category')
thyroid_cancer_train_column_filtered['Adenopathy'] = thyroid_cancer_train_column_filtered['Adenopathy'].map(lambda x: 'No' if x == 'No' else ('Yes' if pd.notna(x) and x != '' else x)).astype('category')
thyroid_cancer_train_column_filtered['Pathology'] = thyroid_cancer_train_column_filtered['Pathology'].map(lambda x: 'Non-Papillary' if (x in ['Hurthle Cell', 'Follicular'])  else 'Papillary').astype('category')
thyroid_cancer_train_column_filtered['Risk'] = thyroid_cancer_train_column_filtered['Risk'].map(lambda x: 'Low' if (x in ['Low'])  else 'Intermediate to High').astype('category')
thyroid_cancer_train_column_filtered['T'] = thyroid_cancer_train_column_filtered['T'].map(lambda x: 'T1 to T2' if (x in ['T1a', 'T1b', 'T2'])  else 'T3 to T4b').astype('category')
thyroid_cancer_train_column_filtered['N'] = thyroid_cancer_train_column_filtered['N'].map(lambda x: 'N0' if (x in ['N0'])  else 'N1').astype('category')
thyroid_cancer_train_column_filtered['Stage'] = thyroid_cancer_train_column_filtered['Stage'].map(lambda x: 'I' if (x in ['I'])  else 'II to IVB').astype('category')
thyroid_cancer_train_column_filtered['Response'] = thyroid_cancer_train_column_filtered['Response'].map(lambda x: 'Indeterminate or Incomplete' if (x in ['Indeterminate', 'Structural Incomplete', 'Biochemical Incomplete'])  else 'Excellent').astype('category')
thyroid_cancer_train_column_filtered.head()
Out[88]:
Age Gender Smoking Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N Stage Response Recurred
140 28 F No Euthyroid Multinodular or Diffuse Goiter No Papillary Uni-Focal Low T1 to T2 N0 I Excellent No
205 36 F No Euthyroid Normal or Single Nodular Goiter Yes Papillary Uni-Focal Low T1 to T2 N1 I Indeterminate or Incomplete No
277 41 M Yes Euthyroid Normal or Single Nodular Goiter No Non-Papillary Multi-Focal Intermediate to High T3 to T4b N0 I Excellent No
294 42 M No Hypothyroidism or Hyperthyroidism Normal or Single Nodular Goiter No Papillary Multi-Focal Intermediate to High T3 to T4b N1 I Indeterminate or Incomplete No
268 32 F No Euthyroid Normal or Single Nodular Goiter No Papillary Uni-Focal Low T3 to T4b N0 I Excellent No
In [89]:
##################################
# Performing a general exploration of the categorical variable levels
# based on the ordered categories
##################################
ordered_cat_cols = thyroid_cancer_train_column_filtered.select_dtypes(include=["category"]).columns
for col in ordered_cat_cols:
    print(f"Column: {col}")
    print("Absolute Frequencies:")
    print(thyroid_cancer_train_column_filtered[col].value_counts().reindex(thyroid_cancer_train_column_filtered[col].cat.categories))
    print("\nNormalized Frequencies:")
    print(thyroid_cancer_train_column_filtered[col].value_counts(normalize=True).reindex(thyroid_cancer_train_column_filtered[col].cat.categories))
    print("-" * 50)
    
Column: Gender
Absolute Frequencies:
M     44
F    160
Name: count, dtype: int64

Normalized Frequencies:
M    0.215686
F    0.784314
Name: proportion, dtype: float64
--------------------------------------------------
Column: Smoking
Absolute Frequencies:
No     177
Yes     27
Name: count, dtype: int64

Normalized Frequencies:
No     0.867647
Yes    0.132353
Name: proportion, dtype: float64
--------------------------------------------------
Column: Thyroid_Function
Absolute Frequencies:
Euthyroid                            171
Hypothyroidism or Hyperthyroidism     33
Name: count, dtype: int64

Normalized Frequencies:
Euthyroid                            0.838235
Hypothyroidism or Hyperthyroidism    0.161765
Name: proportion, dtype: float64
--------------------------------------------------
Column: Physical_Examination
Absolute Frequencies:
Multinodular or Diffuse Goiter      82
Normal or Single Nodular Goiter    122
Name: count, dtype: int64

Normalized Frequencies:
Multinodular or Diffuse Goiter     0.401961
Normal or Single Nodular Goiter    0.598039
Name: proportion, dtype: float64
--------------------------------------------------
Column: Adenopathy
Absolute Frequencies:
No     144
Yes     60
Name: count, dtype: int64

Normalized Frequencies:
No     0.705882
Yes    0.294118
Name: proportion, dtype: float64
--------------------------------------------------
Column: Pathology
Absolute Frequencies:
Non-Papillary     29
Papillary        175
Name: count, dtype: int64

Normalized Frequencies:
Non-Papillary    0.142157
Papillary        0.857843
Name: proportion, dtype: float64
--------------------------------------------------
Column: Focality
Absolute Frequencies:
Uni-Focal      129
Multi-Focal     75
Name: count, dtype: int64

Normalized Frequencies:
Uni-Focal      0.632353
Multi-Focal    0.367647
Name: proportion, dtype: float64
--------------------------------------------------
Column: Risk
Absolute Frequencies:
Intermediate to High     77
Low                     127
Name: count, dtype: int64

Normalized Frequencies:
Intermediate to High    0.377451
Low                     0.622549
Name: proportion, dtype: float64
--------------------------------------------------
Column: T
Absolute Frequencies:
T1 to T2     120
T3 to T4b     84
Name: count, dtype: int64

Normalized Frequencies:
T1 to T2     0.588235
T3 to T4b    0.411765
Name: proportion, dtype: float64
--------------------------------------------------
Column: N
Absolute Frequencies:
N0    139
N1     65
Name: count, dtype: int64

Normalized Frequencies:
N0    0.681373
N1    0.318627
Name: proportion, dtype: float64
--------------------------------------------------
Column: Stage
Absolute Frequencies:
I            174
II to IVB     30
Name: count, dtype: int64

Normalized Frequencies:
I            0.852941
II to IVB    0.147059
Name: proportion, dtype: float64
--------------------------------------------------
Column: Response
Absolute Frequencies:
Excellent                      109
Indeterminate or Incomplete     95
Name: count, dtype: int64

Normalized Frequencies:
Excellent                      0.534314
Indeterminate or Incomplete    0.465686
Name: proportion, dtype: float64
--------------------------------------------------
Column: Recurred
Absolute Frequencies:
No     143
Yes     61
Name: count, dtype: int64

Normalized Frequencies:
No     0.70098
Yes    0.29902
Name: proportion, dtype: float64
--------------------------------------------------
In [90]:
##################################
# Segregating the target
# and predictor variables
##################################
thyroid_cancer_train_predictors = thyroid_cancer_train_column_filtered.iloc[:,:-1].columns
thyroid_cancer_train_predictors_numeric = thyroid_cancer_train_column_filtered.iloc[:,:-1].loc[:, thyroid_cancer_train_column_filtered.iloc[:,:-1].columns == 'Age'].columns
thyroid_cancer_train_predictors_categorical = thyroid_cancer_train_column_filtered.iloc[:,:-1].loc[:,thyroid_cancer_train_column_filtered.iloc[:,:-1].columns != 'Age'].columns
In [91]:
##################################
# Segregating the target variable
# and categorical predictors
##################################
proportion_y_variables = thyroid_cancer_train_predictors_categorical
proportion_x_variable = 'Recurred'
In [92]:
##################################
# Defining the number of 
# rows and columns for the subplots
##################################
num_rows = 4
num_cols = 3

##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 20))

##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()

##################################
# Formulating the individual stacked column plots
# for all categorical columns
##################################
for i, y_variable in enumerate(proportion_y_variables):
    ax = axes[i]
    category_counts = thyroid_cancer_train_column_filtered.groupby([proportion_x_variable, y_variable], observed=True).size().unstack(fill_value=0)
    category_proportions = category_counts.div(category_counts.sum(axis=1), axis=0)
    category_proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{proportion_x_variable} Versus {y_variable}')
    ax.set_xlabel(proportion_x_variable)
    ax.set_ylabel('Proportions')
    ax.legend(loc="lower center")

##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()

##################################
# Presenting the subplots
##################################
plt.show()
[Figure: stacked bar charts of aggregated categorical predictor proportions by Recurred]

1.4.4 Outlier and Distributional Shape Analysis ¶

  1. No outliers (Outlier.Count>0, Outlier.Ratio>0.000), high skewness (Skewness>3 or Skewness<(-3)) or abnormal kurtosis (Kurtosis>2 or Kurtosis<(-2)) were observed for the numeric predictor (see the note on pandas' excess kurtosis in the sketch after this list).
    • Age: Outlier.Count = 0, Outlier.Ratio = 0.000, Skewness = 0.525, Kurtosis = -0.494
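For reference, pandas' skew() and kurtosis() report Fisher (excess) statistics, so a normal distribution scores near zero on both measures; the thresholds above are therefore applied on the excess scale, consistent with the reported Age kurtosis of -0.494. A minimal illustrative sketch:
##################################
# Demonstrating that pandas reports
# excess (Fisher) kurtosis
# (illustrative sketch only)
##################################
import numpy as np
import pandas as pd

rng = np.random.default_rng(12345)
normal_sample = pd.Series(rng.normal(size=100_000))
print(f"Skewness: {normal_sample.skew():.3f}")      # close to 0
print(f"Kurtosis: {normal_sample.kurtosis():.3f}")  # close to 0 on the excess scale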
In [93]:
##################################
# Formulating the imputed dataset
# with numeric columns only
##################################
thyroid_cancer_train_column_filtered['Age'] = pd.to_numeric(thyroid_cancer_train_column_filtered['Age'])
thyroid_cancer_train_column_filtered_numeric = thyroid_cancer_train_column_filtered.select_dtypes(include='number')
thyroid_cancer_train_column_filtered_numeric = thyroid_cancer_train_column_filtered_numeric.to_frame() if isinstance(thyroid_cancer_train_column_filtered_numeric, pd.Series) else thyroid_cancer_train_column_filtered_numeric
In [94]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = list(thyroid_cancer_train_column_filtered_numeric.columns)
In [95]:
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = thyroid_cancer_train_column_filtered_numeric.skew()
In [96]:
##################################
# Computing the interquartile range
# for all columns
##################################
thyroid_cancer_train_column_filtered_numeric_q1 = thyroid_cancer_train_column_filtered_numeric.quantile(0.25)
thyroid_cancer_train_column_filtered_numeric_q3 = thyroid_cancer_train_column_filtered_numeric.quantile(0.75)
thyroid_cancer_train_column_filtered_numeric_iqr = thyroid_cancer_train_column_filtered_numeric_q3 - thyroid_cancer_train_column_filtered_numeric_q1
In [97]:
##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
##################################
numeric_outlier_count_list = ((thyroid_cancer_train_column_filtered_numeric < (thyroid_cancer_train_column_filtered_numeric_q1 - 1.5 * thyroid_cancer_train_column_filtered_numeric_iqr)) | (thyroid_cancer_train_column_filtered_numeric > (thyroid_cancer_train_column_filtered_numeric_q3 + 1.5 * thyroid_cancer_train_column_filtered_numeric_iqr))).sum() 
In [98]:
##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = list([len(thyroid_cancer_train_column_filtered_numeric)] * len(thyroid_cancer_train_column_filtered_numeric.columns))
In [99]:
##################################
# Gathering the outlier to count ratio for each numeric column
##################################
numeric_outlier_ratio_list = list(map(truediv, numeric_outlier_count_list, numeric_row_count_list))
In [101]:
##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = thyroid_cancer_train_column_filtered_numeric.kurtosis()
In [102]:
##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_outlier_count_list,
                                                  numeric_row_count_list,
                                                  numeric_outlier_ratio_list,
                                                  numeric_skewness_list,
                                                  numeric_kurtosis_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Outlier.Count',
                                                 'Row.Count',
                                                 'Outlier.Ratio',
                                                 'Skewness',
                                                 'Kurtosis'])
display(numeric_column_outlier_summary)
Numeric.Column.Name Outlier.Count Row.Count Outlier.Ratio Skewness Kurtosis
0 Age 0 204 0.0 0.525218 -0.494286
In [103]:
##################################
# Formulating the individual boxplots
# for all numeric columns
##################################
for column in thyroid_cancer_train_column_filtered_numeric:
    plt.figure(figsize=(17,1))
    sns.boxplot(data=thyroid_cancer_train_column_filtered_numeric, x=column)
[Figure: boxplot of Age]

1.4.5 Collinearity ¶

  1. The majority of the predictors reported low (<0.50) to moderate (0.50 to 0.75) correlations.
  2. Among pairwise combinations of categorical predictors, high Phi.Coefficient values were noted for the following pairs (a minimal sketch of the Phi coefficient computation follows this list):
    • N and Adenopathy: Phi.Coefficient = +0.805
    • N and Risk: Phi.Coefficient = +0.726
    • Adenopathy and Risk: Phi.Coefficient = +0.674
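For two binary variables coded 0/1, the Phi coefficient is numerically equivalent to the Pearson correlation, which is how the correlation loop below obtains it; it can also be computed directly from the 2x2 contingency table as (ad - bc) / sqrt((a+b)(c+d)(a+c)(b+d)). A minimal sketch with hypothetical toy vectors (illustrative only):
##################################
# Phi coefficient for two binary variables:
# equivalent to the Pearson correlation on 0/1 codes
# (illustrative sketch with toy data)
##################################
import numpy as np
import pandas as pd

x = pd.Series([0, 0, 1, 1, 1, 0, 1, 0])
y = pd.Series([0, 1, 1, 1, 0, 0, 1, 0])

# Pearson correlation on the 0/1 codes
phi_pearson = x.corr(y)

# Same value recovered from the 2x2 contingency table
table = pd.crosstab(x, y).to_numpy()
a, b = table[0]
c, d = table[1]
phi_table = (a * d - b * c) / np.sqrt((a + b) * (c + d) * (a + c) * (b + d))

print(f"{phi_pearson:.6f} {phi_table:.6f}")  # both print 0.500000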
In [104]:
##################################
# Creating a dataset copy and
# converting all values to numeric
# for correlation analysis
##################################
pd.set_option('future.no_silent_downcasting', True)
thyroid_cancer_train_correlation = thyroid_cancer_train_column_filtered.copy()
thyroid_cancer_train_correlation_object = thyroid_cancer_train_correlation.iloc[:,1:13].columns
custom_category_orders = {
    'Gender': ['M', 'F'],  
    'Smoking': ['No', 'Yes'],  
    'Thyroid_Function': ['Euthyroid', 'Hypothyroidism or Hyperthyroidism'],  
    'Physical_Examination': ['Normal or Single Nodular Goiter', 'Multinodular or Diffuse Goiter'],  
    'Adenopathy': ['No', 'Yes'],  
    'Pathology': ['Non-Papillary', 'Papillary'],  
    'Focality': ['Uni-Focal', 'Multi-Focal'],  
    'Risk': ['Low', 'Intermediate to High'],  
    'T': ['T1 to T2', 'T3 to T4b'],  
    'N': ['N0', 'N1'],  
    'Stage': ['I', 'II to IVB'],  
    'Response': ['Excellent', 'Indeterminate or Incomplete'] 
}
encoder = OrdinalEncoder(categories=[custom_category_orders[col] for col in thyroid_cancer_train_correlation_object])
thyroid_cancer_train_correlation[thyroid_cancer_train_correlation_object] = encoder.fit_transform(
    thyroid_cancer_train_correlation[thyroid_cancer_train_correlation_object]
)
thyroid_cancer_train_correlation = thyroid_cancer_train_correlation.drop(['Recurred'], axis=1)
display(thyroid_cancer_train_correlation)
Age Gender Smoking Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N Stage Response
140 28 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
205 36 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0
277 41 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
294 42 0.0 0.0 1.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 0.0 1.0
268 32 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
300 67 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 1.0
115 37 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
67 51 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
161 22 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
55 21 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0

204 rows × 13 columns

In [106]:
##################################
# Creating an empty correlation matrix
##################################
thyroid_cancer_train_correlation_matrix = pd.DataFrame(
    np.zeros((len(thyroid_cancer_train_correlation.columns), len(thyroid_cancer_train_correlation.columns))),
    index=thyroid_cancer_train_correlation.columns,
    columns=thyroid_cancer_train_correlation.columns
)


##################################
# Calculating different types
# of correlation coefficients
# per variable type
##################################
for i in range(len(thyroid_cancer_train_correlation.columns)):
    for j in range(i, len(thyroid_cancer_train_correlation.columns)):
        if i == j:
            thyroid_cancer_train_correlation_matrix.iloc[i, j] = 1.0  
        else:
            col_i = thyroid_cancer_train_correlation.iloc[:, i]
            col_j = thyroid_cancer_train_correlation.iloc[:, j]

            # Detecting binary variables (assumes binary variables are coded as 0/1)
            is_binary_i = col_i.nunique() == 2
            is_binary_j = col_j.nunique() == 2

            # Computing the Pearson correlation for two continuous variables
            if col_i.dtype in ['int64', 'float64'] and col_j.dtype in ['int64', 'float64']:
                corr = col_i.corr(col_j)

            # Computing the Point-Biserial correlation for continuous and binary variables
            elif (col_i.dtype in ['int64', 'float64'] and is_binary_j) or (col_j.dtype in ['int64', 'float64'] and is_binary_i):
                continuous_var = col_i if col_i.dtype in ['int64', 'float64'] else col_j
                binary_var = col_j if is_binary_j else col_i

                # Convert binary variable to 0/1 (if not already)
                binary_var = binary_var.astype('category').cat.codes
                corr, _ = pointbiserialr(continuous_var, binary_var)

            # Computing the Phi coefficient for two binary variables
            elif is_binary_i and is_binary_j:
                corr = col_i.corr(col_j) 

            # Computing the Cramér's V for two categorical variables (if more than 2 categories)
            else:
                contingency_table = pd.crosstab(col_i, col_j)
                chi2, _, _, _ = chi2_contingency(contingency_table)
                n = contingency_table.sum().sum()
                phi2 = chi2 / n
                r, k = contingency_table.shape
                corr = np.sqrt(phi2 / min(k - 1, r - 1))  # Cramér's V formula

            # Assigning correlation values to the matrix
            thyroid_cancer_train_correlation_matrix.iloc[i, j] = corr
            thyroid_cancer_train_correlation_matrix.iloc[j, i] = corr

# Displaying the correlation matrix
display(thyroid_cancer_train_correlation_matrix)
            
Age Gender Smoking Thyroid_Function Physical_Examination Adenopathy Pathology Focality Risk T N Stage Response
Age 1.000000 -0.185530 0.299971 0.077845 0.012021 0.073931 -0.215274 0.195272 0.205360 0.246838 0.013195 0.528144 0.317978
Gender -0.185530 1.000000 -0.604101 -0.093290 -0.031935 -0.158480 0.127817 -0.218103 -0.255507 -0.215101 -0.178550 -0.219727 -0.179431
Smoking 0.299971 -0.604101 1.000000 0.064124 0.004339 0.192350 -0.338086 0.182212 0.233024 0.231679 0.105463 0.327952 0.215362
Thyroid_Function 0.077845 -0.093290 0.064124 1.000000 0.019964 -0.137486 -0.049893 0.051564 -0.012519 -0.042960 -0.043275 0.080702 -0.036498
Physical_Examination 0.012021 -0.031935 0.004339 0.019964 1.000000 0.063246 0.018806 0.245779 0.166012 0.086039 0.104553 0.054799 0.116526
Adenopathy 0.073931 -0.158480 0.192350 -0.137486 0.063246 1.000000 0.047117 0.288750 0.673638 0.421762 0.805406 0.278749 0.518887
Pathology -0.215274 0.127817 -0.338086 -0.049893 0.018806 0.047117 1.000000 -0.126299 -0.117392 -0.286899 0.157869 -0.187683 -0.154637
Focality 0.195272 -0.218103 0.182212 0.051564 0.245779 0.288750 -0.126299 1.000000 0.454926 0.518864 0.307716 0.372331 0.388741
Risk 0.205360 -0.255507 0.233024 -0.012519 0.166012 0.673638 -0.117392 0.454926 1.000000 0.622459 0.726304 0.533264 0.631330
T 0.246838 -0.215101 0.231679 -0.042960 0.086039 0.421762 -0.286899 0.518864 0.622459 1.000000 0.368430 0.468168 0.556742
N 0.013195 -0.178550 0.105463 -0.043275 0.104553 0.805406 0.157869 0.307716 0.726304 0.368430 1.000000 0.310156 0.542672
Stage 0.528144 -0.219727 0.327952 0.080702 0.054799 0.278749 -0.187683 0.372331 0.533264 0.468168 0.310156 1.000000 0.417025
Response 0.317978 -0.179431 0.215362 -0.036498 0.116526 0.518887 -0.154637 0.388741 0.631330 0.556742 0.542672 0.417025 1.000000
In [107]:
##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric and categorical columns
##################################
plt.figure(figsize=(17, 8))
sns.heatmap(thyroid_cancer_train_correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()
[Figure: correlation matrix heatmap for all pairwise predictor combinations]

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

  1. Bivariate analysis identified individual predictors with a generally positive association with the target variable based on visual inspection.
  2. Higher values or higher proportions for the following predictors are associated with the Recurred=Yes category:
    • Age
    • Gender=M
    • Smoking=Yes
    • Physical_Examination=Multinodular or Diffuse Goiter
    • Adenopathy=Yes
    • Focality=Multi-Focal
    • Risk=Intermediate to High
    • T=T3 to T4b
    • N=N1
    • Stage=II to IVB
    • Response=Indeterminate or Incomplete
  3. Proportions for the following predictors showed no clear association with the Recurred=Yes or Recurred=No categories:
    • Thyroid_Function
    • Pathology
In [108]:
##################################
# Segregating the target
# and predictor variables
##################################
thyroid_cancer_train_column_filtered_predictors = thyroid_cancer_train_column_filtered.iloc[:,:-1].columns
thyroid_cancer_train_column_filtered_predictors_numeric = thyroid_cancer_train_column_filtered.iloc[:,:-1].loc[:, thyroid_cancer_train_column_filtered.iloc[:,:-1].columns == 'Age'].columns
thyroid_cancer_train_column_filtered_predictors_categorical = thyroid_cancer_train_column_filtered.iloc[:,:-1].loc[:,thyroid_cancer_train_column_filtered.iloc[:,:-1].columns != 'Age'].columns
In [109]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = thyroid_cancer_train_column_filtered_predictors_numeric
In [110]:
##################################
# Segregating the target variable
# and numeric predictors
##################################
boxplot_y_variable = 'Recurred'
boxplot_x_variable = numeric_variable_name_list.values[0]
In [111]:
##################################
# Evaluating the numeric predictors
# against the target variable
##################################
plt.figure(figsize=(7, 5))
plt.boxplot([group[boxplot_x_variable] for name, group in thyroid_cancer_train_column_filtered.groupby(boxplot_y_variable, observed=True)])
plt.title(f'{boxplot_y_variable} Versus {boxplot_x_variable}')
plt.xlabel(boxplot_y_variable)
plt.ylabel(boxplot_x_variable)
plt.xticks(range(1, len(thyroid_cancer_train_column_filtered[boxplot_y_variable].unique()) + 1), ['No', 'Yes'])
plt.show()
[Figure: boxplots of Age grouped by Recurred]
In [112]:
##################################
# Segregating the target variable
# and categorical predictors
##################################
proportion_y_variables = thyroid_cancer_train_column_filtered_predictors_categorical
proportion_x_variable = 'Recurred'
In [113]:
##################################
# Defining the number of 
# rows and columns for the subplots
##################################
num_rows = 4
num_cols = 3

##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 20))

##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()

##################################
# Formulating the individual stacked column plots
# for all categorical columns
##################################
for i, y_variable in enumerate(proportion_y_variables):
    ax = axes[i]
    category_counts = thyroid_cancer_train_column_filtered.groupby([proportion_x_variable, y_variable], observed=True).size().unstack(fill_value=0)
    category_proportions = category_counts.div(category_counts.sum(axis=1), axis=0)
    category_proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{proportion_x_variable} Versus {y_variable}')
    ax.set_xlabel(proportion_x_variable)
    ax.set_ylabel('Proportions')
    ax.legend(loc="lower center")

##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()

##################################
# Presenting the subplots
##################################
plt.show()
[Figure: stacked bar charts of categorical predictor proportions by Recurred]

1.5.2 Hypothesis Testing ¶

  1. The relationship between the numeric predictor and the Recurred target variable was statistically evaluated using the following hypotheses:
    • Null: Difference in the means between groups Yes and No is equal to zero
    • Alternative: Difference in the means between groups Yes and No is not equal to zero
  2. There is sufficient evidence to conclude that there is a statistically significant difference between the means of the numeric measurements obtained from the Yes and No groups of the Recurred target variable in 1 of 1 numeric predictor, given its high t-test statistic value with a reported low p-value less than the significance level of 0.05.
    • Age: T.Test.Statistic=-3.748, T.Test.PValue=0.000
  3. The relationship between the categorical predictors and the Recurred target variable was statistically evaluated using the following hypotheses:
    • Null: The categorical predictor is independent of the categorical target variable
    • Alternative: The categorical predictor is dependent on the categorical target variable
  4. There is sufficient evidence to conclude that there is a statistically significant relationship between the categories of the categorical predictors and the Yes and No groups of the Recurred target variable in 9 of 12 categorical predictors, given their high chi-square statistic values with reported low p-values less than the significance level of 0.05 (a worked example of the chi-square computation follows this list).
    • Risk: ChiSquare.Test.Statistic=98.599, ChiSquare.Test.PValue=0.000
    • Response: ChiSquare.Test.Statistic=90.866, ChiSquare.Test.PValue=0.000
    • Adenopathy: ChiSquare.Test.Statistic=73.585, ChiSquare.Test.PValue=0.000
    • N: ChiSquare.Test.Statistic=73.176, ChiSquare.Test.PValue=0.000
    • T: ChiSquare.Test.Statistic=62.205, ChiSquare.Test.PValue=0.000
    • Stage: ChiSquare.Test.Statistic=44.963, ChiSquare.Test.PValue=0.000
    • Focality: ChiSquare.Test.Statistic=32.859, ChiSquare.Test.PValue=0.000
    • Gender: ChiSquare.Test.Statistic=17.787, ChiSquare.Test.PValue=0.000
    • Smoking: ChiSquare.Test.Statistic=14.460, ChiSquare.Test.PValue=0.001
  5. There is marginal evidence of a statistically significant relationship between the categories of the categorical predictors and the Yes and No groups of the Recurred target variable in 1 of 12 categorical predictors, given its moderately high chi-square statistic value with a reported p-value near the significance level of 0.10.
    • Physical_Examination: ChiSquare.Test.Statistic=2.413, ChiSquare.Test.PValue=0.120
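To make the reported statistics concrete, the sketch below recomputes the chi-square statistic by hand for a hypothetical 2x2 contingency table and compares it against scipy.stats.chi2_contingency. This is an illustrative example only; note that scipy applies Yates' continuity correction to 2x2 tables by default, so correction=False is set here to match the uncorrected formula.
##################################
# Recomputing the chi-square statistic by hand
# for a toy 2x2 contingency table
# (illustrative sketch only)
##################################
import numpy as np
from scipy import stats

observed = np.array([[90, 30],
                     [40, 44]])

# Expected counts under independence:
# row total * column total / grand total
row_totals = observed.sum(axis=1, keepdims=True)
col_totals = observed.sum(axis=0, keepdims=True)
expected = row_totals @ col_totals / observed.sum()

chi2_manual = ((observed - expected) ** 2 / expected).sum()

# correction=False disables Yates' continuity correction,
# which scipy applies to 2x2 tables by default
chi2_scipy, p_value, dof, _ = stats.chi2_contingency(observed, correction=False)
print(f"manual={chi2_manual:.3f} scipy={chi2_scipy:.3f} p={p_value:.5f}")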
In [114]:
##################################
# Computing the t-test 
# statistic and p-values
# between the target variable
# and numeric predictor columns
##################################
thyroid_cancer_numeric_ttest_target = {}
thyroid_cancer_numeric = thyroid_cancer_train_column_filtered.loc[:,(thyroid_cancer_train_column_filtered.columns == 'Age') | (thyroid_cancer_train_column_filtered.columns == 'Recurred')]
thyroid_cancer_numeric_columns = thyroid_cancer_train_column_filtered_predictors_numeric
for numeric_column in thyroid_cancer_numeric_columns:
    group_0 = thyroid_cancer_numeric[thyroid_cancer_numeric.loc[:,'Recurred']=='No']
    group_1 = thyroid_cancer_numeric[thyroid_cancer_numeric.loc[:,'Recurred']=='Yes']
    thyroid_cancer_numeric_ttest_target['Recurred_' + numeric_column] = stats.ttest_ind(
        group_0[numeric_column], 
        group_1[numeric_column], 
        equal_var=True)
In [115]:
##################################
# Formulating the pairwise ttest summary
# between the target variable
# and numeric predictor columns
##################################
thyroid_cancer_numeric_summary = pd.DataFrame.from_dict(thyroid_cancer_numeric_ttest_target, orient='index')
thyroid_cancer_numeric_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(thyroid_cancer_numeric_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(len(thyroid_cancer_train_column_filtered_predictors_numeric)))
T.Test.Statistic T.Test.PValue
Recurred_Age -3.747942 0.000233
In [116]:
##################################
# Computing the chisquare
# statistic and p-values
# between the target variable
# and categorical predictor columns
##################################
thyroid_cancer_categorical_chisquare_target = {}
thyroid_cancer_categorical = thyroid_cancer_train_column_filtered.loc[:,(thyroid_cancer_train_column_filtered.columns != 'Age') | (thyroid_cancer_train_column_filtered.columns == 'Recurred')]
thyroid_cancer_categorical_columns = thyroid_cancer_train_column_filtered_predictors_categorical
for categorical_column in thyroid_cancer_categorical_columns:
    contingency_table = pd.crosstab(thyroid_cancer_categorical[categorical_column], 
                                    thyroid_cancer_categorical['Recurred'])
    thyroid_cancer_categorical_chisquare_target['Recurred_' + categorical_column] = stats.chi2_contingency(
        contingency_table)[0:2]
In [117]:
##################################
# Formulating the pairwise chisquare summary
# between the target variable
# and categorical predictor columns
##################################
thyroid_cancer_categorical_summary = pd.DataFrame.from_dict(thyroid_cancer_categorical_chisquare_target, orient='index')
thyroid_cancer_categorical_summary.columns = ['ChiSquare.Test.Statistic', 'ChiSquare.Test.PValue']
display(thyroid_cancer_categorical_summary.sort_values(by=['ChiSquare.Test.PValue'], ascending=True).head(len(thyroid_cancer_train_column_filtered_predictors_categorical)))
ChiSquare.Test.Statistic ChiSquare.Test.PValue
Recurred_Risk 98.599608 3.090804e-23
Recurred_Response 90.866461 1.537030e-21
Recurred_Adenopathy 73.585561 9.636704e-18
Recurred_N 73.176134 1.185810e-17
Recurred_T 62.205367 3.094435e-15
Recurred_Stage 44.963917 2.006987e-11
Recurred_Focality 32.859398 9.907099e-09
Recurred_Gender 17.787641 2.469824e-05
Recurred_Smoking 14.460357 1.431406e-04
Recurred_Physical_Examination 2.413115 1.203227e-01
Recurred_Thyroid_Function 0.966826 3.254729e-01
Recurred_Pathology 0.131614 7.167646e-01

1.6. Premodelling Data Preparation ¶

1.6.1 Preprocessed Data Description¶

  1. A total of 6 of the 16 predictors were excluded from the dataset based on the data preprocessing and exploration findings.
  2. There were 3 categorical predictors excluded from the dataset after having been observed with extremely low variance containing categories with very few or almost no variations across observations that may have limited predictive power or drive increased model complexity without performance gains:
    • Hx_Smoking:
      • 193 Hx_Smoking=No: 94.61%
      • 11 Hx_Smoking=Yes: 5.39%
    • Hx_Radiotherapy:
      • 202 Hx_Radiotherapy=No: 99.02%
      • 2 Hx_Radiotherapy=Yes: 0.98%
    • M:
      • 194 M=M0: 95.10%
      • 10 M=M1: 4.90%
  3. There was 1 categorical predictor excluded from the dataset after having been observed with high pairwise collinearity (Phi.Coefficient>0.70) with 2 other predictors, which might provide redundant information and lead to potential instability in regression models.
    • N and Adenopathy: Phi.Coefficient = +0.805
    • N and Risk: Phi.Coefficient = +0.726
  4. Another 2 categorical predictors were excluded from the dataset for not exhibiting a statistically significant association with the Yes and No groups of the Recurred target variable, indicating weak predictive value.
    • Thyroid_Function: ChiSquare.Test.Statistic=0.967, ChiSquare.Test.PValue=0.325
    • Pathology: ChiSquare.Test.Statistic=0.132, ChiSquare.Test.PValue=0.717
  5. The preprocessed train data (final) subset comprises:
    • 204 rows (observations)
      • 143 Recurred=No: 70.10%
      • 61 Recurred=Yes: 29.90%
    • 11 columns (variables)
      • 1/11 target (categorical)
        • Recurred
      • 1/11 predictor (numeric)
        • Age
      • 9/11 predictor (categorical)
        • Gender
        • Smoking
        • Physical_Examination
        • Adenopathy
        • Focality
        • Risk
        • T
        • Stage
        • Response

1.6.2 Preprocessing Pipeline Development¶

  1. A preprocessing pipeline was formulated and applied to the train data (final), validation data and test data with the following actions:
    • Excluded specified columns noted with low variance, high collinearity and weak predictive power
    • Aggregated categories in multiclass categorical variables into binary levels
    • Converted categorical columns to the appropriate type
    • Set the order of category levels for ordinal encoding during modeling pipeline creation
In [118]:
##################################
# Formulating a preprocessing pipeline
# that removes the specified columns,
# aggregates categories in multiclass categorical variables,
# converts categorical columns to the appropriate type, and
# sets the order of category levels
##################################
def preprocess_dataset(df):
    # Removing the specified columns
    columns_to_remove = ['Hx_Smoking', 'Hx_Radiotherapy', 'M', 'N', 'Thyroid_Function', 'Pathology']
    df = df.drop(columns=columns_to_remove)
    
    # Applying category aggregation
    df['Physical_Examination'] = df['Physical_Examination'].map(
        lambda x: 'Normal or Single Nodular Goiter' if x in ['Normal', 'Single nodular goiter-left', 'Single nodular goiter-right'] 
        else 'Multinodular or Diffuse Goiter').astype('category')
    
    df['Adenopathy'] = df['Adenopathy'].map(
        lambda x: 'No' if x == 'No' else ('Yes' if pd.notna(x) and x != '' else x)).astype('category')
    
    df['Risk'] = df['Risk'].map(
        lambda x: 'Low' if x == 'Low' else 'Intermediate to High').astype('category')
    
    df['T'] = df['T'].map(
        lambda x: 'T1 to T2' if x in ['T1a', 'T1b', 'T2'] else 'T3 to T4b').astype('category')
    
    df['Stage'] = df['Stage'].map(
        lambda x: 'I' if x == 'I' else 'II to IVB').astype('category')
    
    df['Response'] = df['Response'].map(
        lambda x: 'Indeterminate or Incomplete' if x in ['Indeterminate', 'Structural Incomplete', 'Biochemical Incomplete'] 
        else 'Excellent').astype('category')
    
    # Setting category levels
    category_mappings = {
        'Gender': ['M', 'F'],
        'Smoking': ['No', 'Yes'],
        'Physical_Examination': ['Normal or Single Nodular Goiter', 'Multinodular or Diffuse Goiter'],
        'Adenopathy': ['No', 'Yes'],
        'Focality': ['Uni-Focal', 'Multi-Focal'],
        'Risk': ['Low', 'Intermediate to High'],
        'T': ['T1 to T2', 'T3 to T4b'],
        'Stage': ['I', 'II to IVB'],
        'Response': ['Excellent', 'Indeterminate or Incomplete']
    }
    
    for col, categories in category_mappings.items():
        df[col] = df[col].astype('category')
        df[col] = df[col].cat.set_categories(categories, ordered=True)
    
    return df
    
In [119]:
##################################
# Applying the preprocessing pipeline
# to the train data
##################################
thyroid_cancer_preprocessed_train = preprocess_dataset(thyroid_cancer_train)
X_preprocessed_train = thyroid_cancer_preprocessed_train.drop('Recurred', axis = 1)
y_preprocessed_train = thyroid_cancer_preprocessed_train['Recurred']
thyroid_cancer_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_PATH, "thyroid_cancer_preprocessed_train.csv"), index=False)
X_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_FEATURES_PATH, "X_preprocessed_train.csv"), index=False)
y_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_TARGET_PATH, "y_preprocessed_train.csv"), index=False)
print('Final Preprocessed Train Dataset Dimensions: ')
display(X_preprocessed_train.shape)
display(y_preprocessed_train.shape)
print('Final Preprocessed Train Target Variable Breakdown: ')
display(y_preprocessed_train.value_counts())
print('Final Preprocessed Train Target Variable Proportion: ')
display(y_preprocessed_train.value_counts(normalize = True))
thyroid_cancer_preprocessed_train.head()
Final Preprocessed Train Dataset Dimensions: 
(204, 10)
(204,)
Final Preprocessed Train Target Variable Breakdown: 
Recurred
No     143
Yes     61
Name: count, dtype: int64
Final Preprocessed Train Target Variable Proportion: 
Recurred
No     0.70098
Yes    0.29902
Name: proportion, dtype: float64
Out[119]:
Age Gender Smoking Physical_Examination Adenopathy Focality Risk T Stage Response Recurred
140 28 F No Multinodular or Diffuse Goiter No Uni-Focal Low T1 to T2 I Excellent No
205 36 F No Normal or Single Nodular Goiter Yes Uni-Focal Low T1 to T2 I Indeterminate or Incomplete No
277 41 M Yes Normal or Single Nodular Goiter No Multi-Focal Intermediate to High T3 to T4b I Excellent No
294 42 M No Normal or Single Nodular Goiter No Multi-Focal Intermediate to High T3 to T4b I Indeterminate or Incomplete No
268 32 F No Normal or Single Nodular Goiter No Uni-Focal Low T3 to T4b I Excellent No
In [120]:
##################################
# Applying the preprocessing pipeline
# to the validation data
##################################
thyroid_cancer_preprocessed_validation = preprocess_dataset(thyroid_cancer_validation)
X_preprocessed_validation = thyroid_cancer_preprocessed_validation.drop('Recurred', axis = 1)
y_preprocessed_validation = thyroid_cancer_preprocessed_validation['Recurred']
thyroid_cancer_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_PATH, "thyroid_cancer_preprocessed_validation.csv"), index=False)
X_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_FEATURES_PATH, "X_preprocessed_validation.csv"), index=False)
y_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_TARGET_PATH, "y_preprocessed_validation.csv"), index=False)
print('Final Preprocessed Validation Dataset Dimensions: ')
display(X_preprocessed_validation.shape)
display(y_preprocessed_validation.shape)
print('Final Preprocessed Validation Target Variable Breakdown: ')
display(y_preprocessed_validation.value_counts())
print('Final Preprocessed Validation Target Variable Proportion: ')
display(y_preprocessed_validation.value_counts(normalize = True))
thyroid_cancer_preprocessed_validation.head()
Final Preprocessed Validation Dataset Dimensions: 
(69, 10)
(69,)
Final Preprocessed Validation Target Variable Breakdown: 
Recurred
No     49
Yes    20
Name: count, dtype: int64
Final Preprocessed Validation Target Variable Proportion: 
Recurred
No     0.710145
Yes    0.289855
Name: proportion, dtype: float64
Out[120]:
Age Gender Smoking Physical_Examination Adenopathy Focality Risk T Stage Response Recurred
173 30 F No Normal or Single Nodular Goiter No Uni-Focal Low T1 to T2 I Indeterminate or Incomplete No
164 29 F No Normal or Single Nodular Goiter No Multi-Focal Low T1 to T2 I Excellent No
256 21 M Yes Normal or Single Nodular Goiter No Uni-Focal Low T3 to T4b I Indeterminate or Incomplete No
348 58 F No Multinodular or Diffuse Goiter Yes Multi-Focal Intermediate to High T3 to T4b II to IVB Indeterminate or Incomplete Yes
131 31 F No Normal or Single Nodular Goiter No Uni-Focal Low T1 to T2 I Excellent No
In [121]:
##################################
# Applying the preprocessing pipeline
# to the test data
##################################
thyroid_cancer_preprocessed_test = preprocess_dataset(thyroid_cancer_test)
X_preprocessed_test = thyroid_cancer_preprocessed_test.drop('Recurred', axis = 1)
y_preprocessed_test = thyroid_cancer_preprocessed_test['Recurred']
thyroid_cancer_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_PATH, "thyroid_cancer_preprocessed_test.csv"), index=False)
X_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_FEATURES_PATH, "X_preprocessed_test.csv"), index=False)
y_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_TARGET_PATH, "y_preprocessed_test.csv"), index=False)
print('Final Preprocessed Test Dataset Dimensions: ')
display(X_preprocessed_test.shape)
display(y_preprocessed_test.shape)
print('Final Preprocessed Test Target Variable Breakdown: ')
display(y_preprocessed_test.value_counts())
print('Final Preprocessed Test Target Variable Proportion: ')
display(y_preprocessed_test.value_counts(normalize = True))
thyroid_cancer_preprocessed_test.head()
Final Preprocessed Test Dataset Dimensions: 
(91, 10)
(91,)
Final Preprocessed Test Target Variable Breakdown: 
Recurred
No     64
Yes    27
Name: count, dtype: int64
Final Preprocessed Test Target Variable Proportion: 
Recurred
No     0.703297
Yes    0.296703
Name: proportion, dtype: float64
Out[121]:
Age Gender Smoking Physical_Examination Adenopathy Focality Risk T Stage Response Recurred
345 25 F No Multinodular or Diffuse Goiter Yes Multi-Focal Intermediate to High T3 to T4b I Indeterminate or Incomplete Yes
249 46 F No Normal or Single Nodular Goiter No Multi-Focal Low T3 to T4b I Excellent No
83 40 F No Normal or Single Nodular Goiter No Uni-Focal Intermediate to High T1 to T2 I Excellent No
184 67 F No Normal or Single Nodular Goiter No Uni-Focal Low T1 to T2 I Excellent No
146 25 F No Multinodular or Diffuse Goiter No Uni-Focal Low T1 to T2 I Indeterminate or Incomplete No
In [122]:
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
    metric_name = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUROC']
    metric_value = [accuracy_score(y_true, y_pred),
                    precision_score(y_true, y_pred),
                    recall_score(y_true, y_pred),
                    f1_score(y_true, y_pred),
                    roc_auc_score(y_true, y_pred)]
    metric_summary = pd.DataFrame(zip(metric_name, metric_value),
                                  columns=['metric_name', 'metric_value'])
    return metric_summary
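As a quick sanity check, the helper can be exercised on arbitrary made-up encoded labels (0=No, 1=Yes); the values below are illustrative only and not project data:
##################################
# Illustrative call of the evaluation helper
# on arbitrary made-up encoded labels
##################################
import numpy as np
demo_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
demo_pred = np.array([0, 1, 1, 1, 0, 0, 1, 0])
display(model_performance_evaluation(demo_true, demo_pred))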

1.7. Bagged Model Development ¶

Bagging (Bootstrap Aggregating) is an ensemble learning technique that reduces model variance by training multiple instances of the same algorithm on different randomly sampled subsets of the training data. The fundamental problem bagging aims to solve is overfitting, particularly in high-variance models. By generating multiple bootstrap samples (random subsets created through sampling with replacement), bagging ensures that each model is trained on slightly different data, making the overall prediction more stable. In classification problems, the final output is obtained by majority voting among the individual models, while in regression, their predictions are averaged. Bagging is particularly effective when dealing with noisy datasets, as it smooths out individual model errors. However, its effectiveness is limited for low-variance models, and the requirement to train multiple models increases computational cost.
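To make these mechanics concrete, below is a minimal from-scratch sketch of bagging on synthetic data; all names (X_toy, y_toy, trees) are illustrative assumptions rather than part of this project's pipeline:
##################################
# Minimal from-scratch bagging sketch:
# bootstrap samples drawn with replacement,
# one decision tree per sample,
# majority voting for the final class
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=8, random_state=0)
rng = np.random.default_rng(0)
trees = []
for _ in range(25):
    # Each bootstrap sample is the same size as the data, drawn with replacement
    idx = rng.integers(0, len(X_toy), size=len(X_toy))
    trees.append(DecisionTreeClassifier(random_state=0).fit(X_toy[idx], y_toy[idx]))
# Majority vote: a row is classified 1 when most trees predict 1
votes = np.array([tree.predict(X_toy) for tree in trees])
y_bagged = (votes.mean(axis=0) >= 0.5).astype(int)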

1.7.1 Random Forest ¶

Random Forest is an ensemble learning method that builds multiple decision trees and combines their outputs to improve prediction accuracy and robustness in binary classification. Instead of relying on a single decision tree, it aggregates multiple trees, reducing overfitting and increasing generalizability. The algorithm works by training individual decision trees on bootstrapped samples of the dataset, where each tree is trained on a slightly different subset of data. Additionally, at each decision node, a random subset of features is considered for splitting, adding further diversity among the trees. The final classification is determined by majority voting across all trees. The main advantages of Random Forest include its resilience to overfitting, ability to handle high-dimensional data, and robustness against noisy data. However, it has limitations, such as higher computational cost due to multiple trees and reduced interpretability compared to a single decision tree. It can also struggle with highly imbalanced data unless additional techniques like class weighting are applied.
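As a brief illustration of the per-split feature subsampling described above, the following sketch fits a Random Forest on synthetic data with max_features='sqrt'; all names are illustrative, not part of this project's pipeline:
##################################
# Random Forest sketch on synthetic data:
# max_features='sqrt' restricts every split
# to a random subset of the features,
# decorrelating the bootstrapped trees
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=9, random_state=0)
rf_sketch = RandomForestClassifier(n_estimators=100,
                                   max_features='sqrt',  # ~3 of 9 features tried per split
                                   class_weight='balanced',
                                   random_state=0).fit(X_toy, y_toy)
# Majority-vote predictions and impurity-based feature importances
print(rf_sketch.predict(X_toy[:5]))
print(rf_sketch.feature_importances_.round(3))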

  1. The random forest model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • n_estimators = number of base estimators in the ensemble made to vary between 100 and 200
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • criterion = entropy
    • max_depth = 6
    • min_samples_leaf = 10
    • n_estimators = 200
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8921
    • Precision = 0.7746
    • Recall = 0.9016
    • F1 Score = 0.8333
    • AUROC = 0.8948
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.7826
    • Recall = 0.9000
    • F1 Score = 0.8372
    • AUROC = 0.8989
  7. Sufficiently comparable apparent and independent validation model performance was observed, which may indicate the absence of excessive model overfitting.
In [123]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [124]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_rf_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('bagged_rf_model', RandomForestClassifier(class_weight='balanced', 
                                               random_state=987654321))
])
In [125]:
##################################
# Defining hyperparameter grid
##################################
bagged_rf_hyperparameter_grid = {
    'bagged_rf_model__criterion': ['gini', 'entropy'],
    'bagged_rf_model__max_depth': [3, 6],
    'bagged_rf_model__min_samples_leaf': [5, 10],
    'bagged_rf_model__n_estimators': [100, 200]
}
In [126]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [127]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_rf_grid_search = GridSearchCV(
    estimator=bagged_rf_pipeline,
    param_grid=bagged_rf_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [128]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [129]:
##################################
# Fitting GridSearchCV
##################################
bagged_rf_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[129]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('bagged_rf_model',
                                        RandomForestClassifier(class_weight='balanced',
                                                               random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_rf_model__criterion': ['gini', 'entropy'],
                         'bagged_rf_model__max_depth': [3, 6],
                         'bagged_rf_model__min_samples_leaf': [5, 10],
                         'bagged_rf_model__n_estimators': [100, 200]},
             scoring='f1', verbose=1)
In [130]:
##################################
# Identifying the best model
##################################
bagged_rf_optimal = bagged_rf_grid_search.best_estimator_
In [131]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_rf_optimal_f1_cv = bagged_rf_grid_search.best_score_
bagged_rf_optimal_f1_train = f1_score(y_preprocessed_train_encoded, bagged_rf_optimal.predict(X_preprocessed_train))
bagged_rf_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, bagged_rf_optimal.predict(X_preprocessed_validation))
In [132]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model - Random Forest: ')
print(f"Best Random Forest Hyperparameters: {bagged_rf_grid_search.best_params_}")
Best Bagged Model - Random Forest: 
Best Random Forest Hyperparameters: {'bagged_rf_model__criterion': 'entropy', 'bagged_rf_model__max_depth': 6, 'bagged_rf_model__min_samples_leaf': 10, 'bagged_rf_model__n_estimators': 200}
In [133]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_rf_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_rf_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, bagged_rf_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8218
F1 Score on Training Data: 0.8333

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.89      0.92       143
         1.0       0.77      0.90      0.83        61

    accuracy                           0.89       204
   macro avg       0.86      0.89      0.88       204
weighted avg       0.90      0.89      0.89       204

In [134]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, bagged_rf_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, bagged_rf_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Random Forest Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Random Forest Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Random Forest model on the train data]
In [135]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_rf_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, bagged_rf_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8372

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        49
         1.0       0.78      0.90      0.84        20

    accuracy                           0.90        69
   macro avg       0.87      0.90      0.88        69
weighted avg       0.91      0.90      0.90        69

In [136]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, bagged_rf_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, bagged_rf_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Random Forest Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Random Forest Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Random Forest model on the validation data]
In [137]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_rf_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, bagged_rf_optimal.predict(X_preprocessed_train))
bagged_rf_optimal_train['model'] = ['bagged_rf_optimal'] * 5
bagged_rf_optimal_train['set'] = ['train'] * 5
print('Optimal Random Forest Train Performance Metrics: ')
display(bagged_rf_optimal_train)
Optimal Random Forest Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.892157 bagged_rf_optimal train
1 Precision 0.774648 bagged_rf_optimal train
2 Recall 0.901639 bagged_rf_optimal train
3 F1 0.833333 bagged_rf_optimal train
4 AUROC 0.894876 bagged_rf_optimal train
In [138]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_rf_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, bagged_rf_optimal.predict(X_preprocessed_validation))
bagged_rf_optimal_validation['model'] = ['bagged_rf_optimal'] * 5
bagged_rf_optimal_validation['set'] = ['validation'] * 5
print('Optimal Random Forest Validation Performance Metrics: ')
display(bagged_rf_optimal_validation)
Optimal Random Forest Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.898551 bagged_rf_optimal validation
1 Precision 0.782609 bagged_rf_optimal validation
2 Recall 0.900000 bagged_rf_optimal validation
3 F1 0.837209 bagged_rf_optimal validation
4 AUROC 0.898980 bagged_rf_optimal validation
In [139]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_rf_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_random_forest_optimal.pkl"))
Out[139]:
['..\\models\\bagged_model_random_forest_optimal.pkl']

1.7.2 Extra Trees ¶

Extra Trees (Extremely Randomized Trees) is a variation of Random Forest that introduces more randomness into tree construction to improve generalization. Like Random Forest, it builds multiple decision trees and aggregates their votes, but it differs in two ways: each tree is typically trained on the full dataset rather than a bootstrap sample, and rather than searching for the best split based on information gain or Gini impurity, Extra Trees draws split thresholds at random for each candidate feature and keeps the best of these random splits. This extra randomness can prevent overfitting and make the model more robust to small variations in data. The key advantages of Extra Trees include its speed, as it does not need to search for the best split threshold at each node, and its ability to handle large datasets efficiently. However, since it relies on random splits, it may not perform as well as Random Forest on some datasets, especially when strong feature interactions exist. Additionally, its randomness can make the model harder to interpret and tune effectively.
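The sketch below, on synthetic data with illustrative names, surfaces the two scikit-learn defaults that distinguish Extra Trees from Random Forest (random split thresholds and bootstrap=False):
##################################
# Extra Trees versus Random Forest sketch:
# Extra Trees draws split thresholds at random
# and by default trains each tree on the full
# dataset (bootstrap=False) rather than on a
# bootstrap sample (bootstrap=True)
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=9, random_state=0)
et_sketch = ExtraTreesClassifier(n_estimators=100, random_state=0).fit(X_toy, y_toy)
rf_sketch = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_toy, y_toy)
# Both aggregate by majority vote; only the split-selection and sampling rules differ
print('Extra Trees bootstrap default:', et_sketch.bootstrap)    # False
print('Random Forest bootstrap default:', rf_sketch.bootstrap)  # True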

  1. The extra trees model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • n_estimators = number of base estimators in the ensemble made to vary between 100 and 200
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • criterion = entropy
    • max_depth = 6
    • min_samples_leaf = 10
    • n_estimators = 200
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8921
    • Precision = 0.7746
    • Recall = 0.9016
    • F1 Score = 0.8333
    • AUROC = 0.8948
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.7826
    • Recall = 0.9000
    • F1 Score = 0.8372
    • AUROC = 0.8989
  7. Sufficiently comparable apparent and independent validation model performance was observed, which may indicate the absence of excessive model overfitting.
In [140]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [141]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_et_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('bagged_et_model', ExtraTreesClassifier(class_weight='balanced', 
                                               random_state=987654321))
])
In [142]:
##################################
# Defining hyperparameter grid
##################################
bagged_et_hyperparameter_grid = {
    'bagged_et_model__criterion': ['gini', 'entropy'],
    'bagged_et_model__max_depth': [3, 6],
    'bagged_et_model__min_samples_leaf': [5, 10],
    'bagged_et_model__n_estimators': [100, 200]
}
In [143]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [144]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_et_grid_search = GridSearchCV(
    estimator=bagged_et_pipeline,
    param_grid=bagged_et_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [145]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [146]:
##################################
# Fitting GridSearchCV
##################################
bagged_et_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[146]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('bagged_et_model',
                                        ExtraTreesClassifier(class_weight='balanced',
                                                             random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_et_model__criterion': ['gini', 'entropy'],
                         'bagged_et_model__max_depth': [3, 6],
                         'bagged_et_model__min_samples_leaf': [5, 10],
                         'bagged_et_model__n_estimators': [100, 200]},
             scoring='f1', verbose=1)
In [147]:
##################################
# Identifying the best model
##################################
bagged_et_optimal = bagged_et_grid_search.best_estimator_
In [148]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_et_optimal_f1_cv = bagged_et_grid_search.best_score_
bagged_et_optimal_f1_train = f1_score(y_preprocessed_train_encoded, bagged_et_optimal.predict(X_preprocessed_train))
bagged_et_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, bagged_et_optimal.predict(X_preprocessed_validation))
In [149]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model - Extra Trees: ')
print(f"Best Extra Trees Hyperparameters: {bagged_et_grid_search.best_params_}")
Best Bagged Model - Extra Trees: 
Best Extra Trees Hyperparameters: {'bagged_et_model__criterion': 'entropy', 'bagged_et_model__max_depth': 6, 'bagged_et_model__min_samples_leaf': 10, 'bagged_et_model__n_estimators': 200}
In [150]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_et_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_et_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, bagged_et_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8101
F1 Score on Training Data: 0.8333

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.89      0.92       143
         1.0       0.77      0.90      0.83        61

    accuracy                           0.89       204
   macro avg       0.86      0.89      0.88       204
weighted avg       0.90      0.89      0.89       204

In [151]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, bagged_et_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, bagged_et_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Extra Trees Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Extra Trees Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Extra Trees model on the train data]
In [152]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_et_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, bagged_et_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8372

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        49
         1.0       0.78      0.90      0.84        20

    accuracy                           0.90        69
   macro avg       0.87      0.90      0.88        69
weighted avg       0.91      0.90      0.90        69

In [153]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, bagged_et_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, bagged_et_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Extra Trees Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Extra Trees Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Extra Trees model on the validation data]
In [154]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_et_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, bagged_et_optimal.predict(X_preprocessed_train))
bagged_et_optimal_train['model'] = ['bagged_et_optimal'] * 5
bagged_et_optimal_train['set'] = ['train'] * 5
print('Optimal Extra Trees Train Performance Metrics: ')
display(bagged_et_optimal_train)
Optimal Extra Trees Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.892157 bagged_et_optimal train
1 Precision 0.774648 bagged_et_optimal train
2 Recall 0.901639 bagged_et_optimal train
3 F1 0.833333 bagged_et_optimal train
4 AUROC 0.894876 bagged_et_optimal train
In [155]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_et_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, bagged_et_optimal.predict(X_preprocessed_validation))
bagged_et_optimal_validation['model'] = ['bagged_et_optimal'] * 5
bagged_et_optimal_validation['set'] = ['validation'] * 5
print('Optimal Extra Trees Validation Performance Metrics: ')
display(bagged_et_optimal_validation)
Optimal Extra Trees Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.898551 bagged_et_optimal validation
1 Precision 0.782609 bagged_et_optimal validation
2 Recall 0.900000 bagged_et_optimal validation
3 F1 0.837209 bagged_et_optimal validation
4 AUROC 0.898980 bagged_et_optimal validation
In [156]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_et_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_extra_trees_optimal.pkl"))
Out[156]:
['..\\models\\bagged_model_extra_trees_optimal.pkl']

1.7.3 Bagged Decision Trees ¶

Bagged Decision Trees is an ensemble method that reduces overfitting by training multiple decision trees on different bootstrap samples and aggregating their predictions. Unlike Random Forest, it considers all features when searching for the best split at each node, making it less random but still improving stability compared to a single decision tree. The process involves drawing multiple random subsets of the training data (with replacement), training a decision tree on each subset, and combining the predictions using majority voting for classification. This technique helps to reduce variance and prevent overfitting, leading to more stable and accurate predictions. The main advantage of Bagged Decision Trees is that they perform well on complex datasets without requiring deep tuning. However, the downside is that they require significant computational power and memory, as multiple trees must be trained and stored. Additionally, unlike boosting methods, bagging does not inherently reduce bias, meaning the performance is still dependent on the base decision tree's predictive power.
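As a side note, because each bootstrap sample omits roughly a third of the rows on average, BaggingClassifier can produce a built-in out-of-bag accuracy estimate; a minimal sketch on synthetic data (illustrative names, separate from this project's pipeline) follows:
##################################
# Bagged decision trees with out-of-bag scoring:
# rows left out of a tree's bootstrap sample
# (about 37% on average) act as a built-in
# validation set, summarized by oob_score_
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=9, random_state=0)
bagged_sketch = BaggingClassifier(estimator=DecisionTreeClassifier(),  # all features considered per split
                                  n_estimators=100,
                                  oob_score=True,
                                  random_state=0).fit(X_toy, y_toy)
print('Out-of-bag accuracy estimate:', round(bagged_sketch.oob_score_, 4))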

  1. The bagging classifier and decision tree models from the sklearn.ensemble and sklearn.tree Python library APIs were implemented.
  2. The model contains 4 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • n_estimators = number of base estimators in the ensemble made to vary between 100 and 200
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • criterion = gini
    • max_depth = 6
    • min_samples_leaf = 5
    • n_estimators = 200
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9019
    • Precision = 0.7971
    • Recall = 0.9016
    • F1 Score = 0.8461
    • AUROC = 0.9018
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. Sufficiently comparable apparent and independent validation model performance was observed, which may indicate the absence of excessive model overfitting.
In [157]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [158]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_bdt_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('bagged_bdt_model', BaggingClassifier(estimator=DecisionTreeClassifier(class_weight='balanced', 
                                                                            random_state=987654321),
                                           random_state=987654321))
])
In [159]:
##################################
# Defining hyperparameter grid
##################################
bagged_bdt_hyperparameter_grid = {
    'bagged_bdt_model__estimator__criterion': ['gini', 'entropy'],
    'bagged_bdt_model__estimator__max_depth': [3, 6],
    'bagged_bdt_model__estimator__min_samples_leaf': [5, 10],
    'bagged_bdt_model__n_estimators': [100, 200]
}
In [160]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [161]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_bdt_grid_search = GridSearchCV(
    estimator=bagged_bdt_pipeline,
    param_grid=bagged_bdt_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [162]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [163]:
##################################
# Fitting GridSearchCV
##################################
bagged_bdt_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[163]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])...
                                        BaggingClassifier(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                                           random_state=987654321),
                                                          random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_bdt_model__estimator__criterion': ['gini',
                                                                    'entropy'],
                         'bagged_bdt_model__estimator__max_depth': [3, 6],
                         'bagged_bdt_model__estimator__min_samples_leaf': [5,
                                                                           10],
                         'bagged_bdt_model__n_estimators': [100, 200]},
             scoring='f1', verbose=1)
In [164]:
##################################
# Identifying the best model
##################################
bagged_bdt_optimal = bagged_bdt_grid_search.best_estimator_
In [165]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_bdt_optimal_f1_cv = bagged_bdt_grid_search.best_score_
bagged_bdt_optimal_f1_train = f1_score(y_preprocessed_train_encoded, bagged_bdt_optimal.predict(X_preprocessed_train))
bagged_bdt_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, bagged_bdt_optimal.predict(X_preprocessed_validation))
In [166]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model – Bagged Decision Trees: ')
print(f"Best Bagged Decision Trees Hyperparameters: {bagged_bdt_grid_search.best_params_}")
Best Bagged Model – Bagged Decision Trees: 
Best Bagged Decision Trees Hyperparameters: {'bagged_bdt_model__estimator__criterion': 'gini', 'bagged_bdt_model__estimator__max_depth': 6, 'bagged_bdt_model__estimator__min_samples_leaf': 5, 'bagged_bdt_model__n_estimators': 200}
In [167]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_bdt_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_bdt_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, bagged_bdt_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8287
F1 Score on Training Data: 0.8462

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93       143
         1.0       0.80      0.90      0.85        61

    accuracy                           0.90       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.90      0.90       204

In [168]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, bagged_bdt_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, bagged_bdt_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Decision Trees Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Decision Trees Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Decision Trees Train Performance]
In [169]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_bdt_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, bagged_bdt_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [170]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, bagged_bdt_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, bagged_bdt_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Decision Trees Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Decision Trees Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Decision Trees Validation Performance]
In [171]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_bdt_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, bagged_bdt_optimal.predict(X_preprocessed_train))
bagged_bdt_optimal_train['model'] = ['bagged_bdt_optimal'] * 5
bagged_bdt_optimal_train['set'] = ['train'] * 5
print('Optimal Bagged Decision Trees Train Performance Metrics: ')
display(bagged_bdt_optimal_train)
Optimal Bagged Decision Trees Train Performance Metrics: 
  metric_name  metric_value               model    set
0    Accuracy      0.901961  bagged_bdt_optimal  train
1   Precision      0.797101  bagged_bdt_optimal  train
2      Recall      0.901639  bagged_bdt_optimal  train
3          F1      0.846154  bagged_bdt_optimal  train
4       AUROC      0.901869  bagged_bdt_optimal  train
In [172]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_bdt_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, bagged_bdt_optimal.predict(X_preprocessed_validation))
bagged_bdt_optimal_validation['model'] = ['bagged_bdt_optimal'] * 5
bagged_bdt_optimal_validation['set'] = ['validation'] * 5
print('Optimal Bagged Decision Trees Validation Performance Metrics: ')
display(bagged_bdt_optimal_validation)
Optimal Bagged Decision Trees Validation Performance Metrics: 
  metric_name  metric_value               model         set
0    Accuracy      0.913043  bagged_bdt_optimal  validation
1   Precision      0.818182  bagged_bdt_optimal  validation
2      Recall      0.900000  bagged_bdt_optimal  validation
3          F1      0.857143  bagged_bdt_optimal  validation
4       AUROC      0.909184  bagged_bdt_optimal  validation
In [173]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_bdt_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_bagged_decision_trees_optimal.pkl"))
Out[173]:
['..\\models\\bagged_model_bagged_decision_trees_optimal.pkl']

1.7.4 Bagged Logistic Regression ¶

Bagged Logistic Regression applies bootstrap aggregation (bagging) to logistic regression, improving its stability and generalization. Logistic regression is inherently a high-bias model, meaning it can underperform on complex, non-linear data. Bagging helps by training multiple logistic regression models on different bootstrap samples and averaging their probability outputs for final classification. This reduces variance and improves robustness, especially when dealing with small datasets prone to fluctuations. The main advantage is that it stabilizes logistic regression by reducing overfitting without adding significant complexity. Additionally, it works well when the relationship between features and the target variable is approximately linear. However, since logistic regression is a weak learner, bagging does not dramatically boost performance on highly non-linear problems. It is also computationally expensive compared to a single logistic regression model, and unlike boosting, it does not correct the inherent bias of logistic regression.
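As a minimal sketch of the soft-voting mechanics that BaggingClassifier automates in the cells below, the snippet here (using synthetic stand-in data, not the project dataset) fits one logistic regression per bootstrap sample and averages the predicted probabilities:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in data for illustration only
X, y = make_classification(n_samples=300, n_features=8, random_state=987654321)

rng = np.random.default_rng(987654321)
member_probabilities = []
for _ in range(50):
    # Draw a bootstrap sample by resampling rows with replacement
    idx = rng.integers(0, len(X), size=len(X))
    member = LogisticRegression(class_weight='balanced', max_iter=1000)
    member.fit(X[idx], y[idx])
    member_probabilities.append(member.predict_proba(X)[:, 1])

# Soft voting: average the per-member probabilities, then threshold at 0.5
bagged_probability = np.mean(member_probabilities, axis=0)
bagged_prediction = (bagged_probability >= 0.5).astype(int)

The tuned implementation below delegates exactly this resampling and averaging to BaggingClassifier so that it can be cross-validated inside the existing preprocessing pipeline.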

  1. The bagging classifier and logistic regression models from the sklearn.ensemble and sklearn.linear_model Python library APIs were implemented.
  2. The model contains 4 hyperparameters for tuning:
    • C = inverse of regularization strength made to vary between 0.1 and 1.0
    • penalty = penalty norm made to vary between l1 and l2
    • solver = algorithm used in the optimization problem made to vary between liblinear and saga
    • n_estimators = number of base estimators in the ensemble made to vary between 100 and 200
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories (see the weighting sketch after this list).
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • C = 1.0
    • penalty = l1
    • solver = liblinear
    • n_estimators = 200
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8921
    • Precision = 0.7746
    • Recall = 0.9016
    • F1 Score = 0.8333
    • AUROC = 0.8948
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.7826
    • Recall = 0.9000
    • F1 Score = 0.8372
    • AUROC = 0.8989
  7. The apparent and independent validation model performance was sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
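As referenced in item 3 above, the class_weight = balanced setting reweights each class inversely to its frequency. A minimal sketch of sklearn's underlying formula, applied to an illustrative 2:1 label vector rather than the project data, is shown here:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Illustrative 2:1 imbalanced label vector (not the project data)
y_example = np.array([0] * 140 + [1] * 70)

# The 'balanced' heuristic: n_samples / (n_classes * per-class count)
manual_weights = len(y_example) / (2 * np.bincount(y_example))
sklearn_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_example)

print(manual_weights)   # [0.75 1.5 ] -> the minority class is weighted twice as heavily
print(sklearn_weights)  # matches the manual computation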
In [174]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [175]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_blr_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('bagged_blr_model', BaggingClassifier(estimator=LogisticRegression(class_weight='balanced', 
                                                                        random_state=987654321),
                                           random_state=987654321))
])
In [176]:
##################################
# Defining hyperparameter grid
##################################
bagged_blr_hyperparameter_grid = {
    'bagged_blr_model__estimator__C': [0.1, 1.0],
    'bagged_blr_model__estimator__penalty': ['l1', 'l2'],
    'bagged_blr_model__estimator__solver': ['liblinear', 'saga'],
    'bagged_blr_model__n_estimators': [100, 200]
}
In [177]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [178]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_blr_grid_search = GridSearchCV(
    estimator=bagged_blr_pipeline,
    param_grid=bagged_blr_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [179]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [180]:
##################################
# Fitting GridSearchCV
##################################
bagged_blr_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[180]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])...
                                        BaggingClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                                                       random_state=987654321),
                                                          random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_blr_model__estimator__C': [0.1, 1.0],
                         'bagged_blr_model__estimator__penalty': ['l1', 'l2'],
                         'bagged_blr_model__estimator__solver': ['liblinear',
                                                                 'saga'],
                         'bagged_blr_model__n_estimators': [100, 200]},
             scoring='f1', verbose=1)
In [181]:
##################################
# Identifying the best model
##################################
bagged_blr_optimal = bagged_blr_grid_search.best_estimator_
In [182]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_blr_optimal_f1_cv = bagged_blr_grid_search.best_score_
bagged_blr_optimal_f1_train = f1_score(y_preprocessed_train_encoded, bagged_blr_optimal.predict(X_preprocessed_train))
bagged_blr_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, bagged_blr_optimal.predict(X_preprocessed_validation))
In [183]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model – Bagged Logistic Regression: ')
print(f"Best Bagged Logistic Regression Hyperparameters: {bagged_blr_grid_search.best_params_}")
Best Bagged Model – Bagged Logistic Regression: 
Best Bagged Logistic Regression Hyperparameters: {'bagged_blr_model__estimator__C': 1.0, 'bagged_blr_model__estimator__penalty': 'l1', 'bagged_blr_model__estimator__solver': 'liblinear', 'bagged_blr_model__n_estimators': 200}
In [184]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_blr_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_blr_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, bagged_blr_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8213
F1 Score on Training Data: 0.8333

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.89      0.92       143
         1.0       0.77      0.90      0.83        61

    accuracy                           0.89       204
   macro avg       0.86      0.89      0.88       204
weighted avg       0.90      0.89      0.89       204

In [185]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, bagged_blr_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, bagged_blr_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Logistic Regression Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Logistic Regression Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Logistic Regression Train Performance]
In [186]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_blr_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, bagged_blr_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8372

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        49
         1.0       0.78      0.90      0.84        20

    accuracy                           0.90        69
   macro avg       0.87      0.90      0.88        69
weighted avg       0.91      0.90      0.90        69

In [187]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, bagged_blr_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, bagged_blr_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Logistic Regression Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Logistic Regression Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Logistic Regression Validation Performance]
In [188]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_blr_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, bagged_blr_optimal.predict(X_preprocessed_train))
bagged_blr_optimal_train['model'] = ['bagged_blr_optimal'] * 5
bagged_blr_optimal_train['set'] = ['train'] * 5
print('Optimal Bagged Logistic Regression Train Performance Metrics: ')
display(bagged_blr_optimal_train)
Optimal Bagged Logistic Regression Train Performance Metrics: 
  metric_name  metric_value               model    set
0    Accuracy      0.892157  bagged_blr_optimal  train
1   Precision      0.774648  bagged_blr_optimal  train
2      Recall      0.901639  bagged_blr_optimal  train
3          F1      0.833333  bagged_blr_optimal  train
4       AUROC      0.894876  bagged_blr_optimal  train
In [189]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_blr_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, bagged_blr_optimal.predict(X_preprocessed_validation))
bagged_blr_optimal_validation['model'] = ['bagged_blr_optimal'] * 5
bagged_blr_optimal_validation['set'] = ['validation'] * 5
print('Optimal Bagged Logistic Regression Validation Performance Metrics: ')
display(bagged_blr_optimal_validation)
Optimal Bagged Logistic Regression Validation Performance Metrics: 
  metric_name  metric_value               model         set
0    Accuracy      0.898551  bagged_blr_optimal  validation
1   Precision      0.782609  bagged_blr_optimal  validation
2      Recall      0.900000  bagged_blr_optimal  validation
3          F1      0.837209  bagged_blr_optimal  validation
4       AUROC      0.898980  bagged_blr_optimal  validation
In [190]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_blr_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_bagged_logistic_regression_optimal.pkl"))
Out[190]:
['..\\models\\bagged_model_bagged_logistic_regression_optimal.pkl']

1.7.5 Bagged Support Vector Machine ¶

Bagged Support Vector Machine is an ensemble method that applies bagging to multiple SVM classifiers trained on different bootstrap samples, reducing variance while maintaining SVM's strong classification capabilities. SVM works by finding an optimal decision boundary (hyperplane) that maximizes the margin between different classes. However, a single SVM can be sensitive to small changes in data, especially when working with noisy datasets. By training multiple SVM models on different subsets and aggregating their predictions (majority voting), bagging stabilizes the decision boundary and enhances robustness. This approach is particularly useful when dealing with high-dimensional datasets with complex relationships. The key advantages include improved generalization, reduced overfitting, and better handling of noisy data. However, SVM is computationally intensive, and bagging increases the overall training time significantly, especially for large datasets. Additionally, combining multiple SVM models makes interpretation difficult, and performance gains may not always justify the added computational cost.
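Since SVC does not expose predicted probabilities unless probability=True is set, bagged SVMs are aggregated by majority voting. A minimal sketch of that mechanism (again on synthetic stand-in data, not the project dataset) is shown below:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

# Synthetic stand-in data for illustration only
X, y = make_classification(n_samples=300, n_features=8, random_state=987654321)

rng = np.random.default_rng(987654321)
votes = []
for _ in range(25):
    # Train each SVM member on its own bootstrap sample
    idx = rng.integers(0, len(X), size=len(X))
    member = SVC(kernel='linear', class_weight='balanced')
    member.fit(X[idx], y[idx])
    votes.append(member.predict(X))

# Majority voting: the class chosen by most members wins each sample
vote_matrix = np.stack(votes)
bagged_prediction = (vote_matrix.mean(axis=0) >= 0.5).astype(int)

BaggingClassifier performs an analogous vote automatically whenever its base estimator lacks a predict_proba method, which is the case for the SVC configuration tuned below.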

  1. The bagging classifier and support vector machine models from the sklearn.ensemble and sklearn.svm Python library APIs were implemented.
  2. The model contains 4 hyperparameters for tuning:
    • C = regularization parameter (inversely proportional to the regularization strength) made to vary between 0.1 and 1.0
    • kernel = kernel type to be used in the algorithm made to vary between linear and rbf
    • gamma = kernel coefficient made to vary between scale and auto
    • n_estimators = number of base estimators in the ensemble made to vary between 100 and 200
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • C = 1.0
    • kernel = linear
    • gamma = scale
    • n_estimators = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9068
    • Precision = 0.8088
    • Recall = 0.9016
    • F1 Score = 0.8527
    • AUROC = 0.9053
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. The apparent and independent validation model performance was sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [191]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [192]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_bsvm_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('bagged_bsvm_model', BaggingClassifier(estimator=SVC(class_weight='balanced', 
                                                          random_state=987654321),
                                            random_state=987654321))
])
In [193]:
##################################
# Defining hyperparameter grid
##################################
bagged_bsvm_hyperparameter_grid = {
    'bagged_bsvm_model__estimator__C': [0.1, 1.0],
    'bagged_bsvm_model__estimator__kernel': ['linear', 'rbf'],
    'bagged_bsvm_model__estimator__gamma': ['scale','auto'],
    'bagged_bsvm_model__n_estimators': [100, 200]
}
In [194]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [195]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_bsvm_grid_search = GridSearchCV(
    estimator=bagged_bsvm_pipeline,
    param_grid=bagged_bsvm_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [196]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [197]:
##################################
# Fitting GridSearchCV
##################################
bagged_bsvm_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[197]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])...
                                        BaggingClassifier(estimator=SVC(class_weight='balanced',
                                                                        random_state=987654321),
                                                          random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_bsvm_model__estimator__C': [0.1, 1.0],
                         'bagged_bsvm_model__estimator__gamma': ['scale',
                                                                 'auto'],
                         'bagged_bsvm_model__estimator__kernel': ['linear',
                                                                  'rbf'],
                         'bagged_bsvm_model__n_estimators': [100, 200]},
             scoring='f1', verbose=1)
In [198]:
##################################
# Identifying the best model
##################################
bagged_bsvm_optimal = bagged_bsvm_grid_search.best_estimator_
In [199]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_bsvm_optimal_f1_cv = bagged_bsvm_grid_search.best_score_
bagged_bsvm_optimal_f1_train = f1_score(y_preprocessed_train_encoded, bagged_bsvm_optimal.predict(X_preprocessed_train))
bagged_bsvm_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, bagged_bsvm_optimal.predict(X_preprocessed_validation))
In [200]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model – Bagged Support Vector Machine: ')
print(f"Best Bagged Support Vector Machine Hyperparameters: {bagged_bsvm_grid_search.best_params_}")
Best Bagged Model – Bagged Support Vector Machine: 
Best Bagged Support Vector Machine Hyperparameters: {'bagged_bsvm_model__estimator__C': 1.0, 'bagged_bsvm_model__estimator__gamma': 'scale', 'bagged_bsvm_model__estimator__kernel': 'linear', 'bagged_bsvm_model__n_estimators': 100}
In [201]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_bsvm_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_bsvm_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, bagged_bsvm_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8209
F1 Score on Training Data: 0.8527

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.91      0.93       143
         1.0       0.81      0.90      0.85        61

    accuracy                           0.91       204
   macro avg       0.88      0.91      0.89       204
weighted avg       0.91      0.91      0.91       204

In [202]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, bagged_bsvm_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, bagged_bsvm_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Support Vector Machine Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Support Vector Machine Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Support Vector Machine Train Performance]
In [203]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_bsvm_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, bagged_bsvm_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [204]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, bagged_bsvm_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, bagged_bsvm_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Bagged Support Vector Machine Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Bagged Support Vector Machine Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices, Optimal Bagged Support Vector Machine Validation Performance]
In [205]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_bsvm_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, bagged_bsvm_optimal.predict(X_preprocessed_train))
bagged_bsvm_optimal_train['model'] = ['bagged_bsvm_optimal'] * 5
bagged_bsvm_optimal_train['set'] = ['train'] * 5
print('Optimal Bagged Support Vector Machine Train Performance Metrics: ')
display(bagged_bsvm_optimal_train)
Optimal Bagged Support Vector Machine Train Performance Metrics: 
  metric_name  metric_value                model    set
0    Accuracy      0.906863  bagged_bsvm_optimal  train
1   Precision      0.808824  bagged_bsvm_optimal  train
2      Recall      0.901639  bagged_bsvm_optimal  train
3          F1      0.852713  bagged_bsvm_optimal  train
4       AUROC      0.905365  bagged_bsvm_optimal  train
In [206]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_bsvm_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, bagged_bsvm_optimal.predict(X_preprocessed_validation))
bagged_bsvm_optimal_validation['model'] = ['bagged_bsvm_optimal'] * 5
bagged_bsvm_optimal_validation['set'] = ['validation'] * 5
print('Optimal Bagged Support Vector Machine Validation Performance Metrics: ')
display(bagged_bsvm_optimal_validation)
Optimal Bagged Support Vector Machine Validation Performance Metrics: 
  metric_name  metric_value                model         set
0    Accuracy      0.913043  bagged_bsvm_optimal  validation
1   Precision      0.818182  bagged_bsvm_optimal  validation
2      Recall      0.900000  bagged_bsvm_optimal  validation
3          F1      0.857143  bagged_bsvm_optimal  validation
4       AUROC      0.909184  bagged_bsvm_optimal  validation
In [207]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_bsvm_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_bagged_svm_optimal.pkl"))
Out[207]:
['..\\models\\bagged_model_bagged_svm_optimal.pkl']

1.8 Boosted Model Development ¶

Boosting is an ensemble learning method that builds a strong classifier by training models sequentially, where each new model focuses on correcting the mistakes of its predecessors. Boosting assigns higher weights to misclassified instances, ensuring that subsequent models pay more attention to these hard-to-classify cases. The motivation behind boosting is to reduce both bias and variance by iteratively refining weak learners — models that perform only slightly better than random guessing — until they collectively form a strong classifier. In classification tasks, predictions are refined by combining weighted outputs of multiple weak models, typically decision stumps or shallow trees. This makes boosting highly effective in uncovering complex patterns in data. However, the sequential nature of boosting makes it computationally expensive compared to bagging, and it is more prone to overfitting if the number of weak learners is too high.
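Because each stage is intended to correct its predecessors, the sequential refinement can be observed directly. The minimal sketch below (on synthetic stand-in data, with illustrative names like booster and X_val rather than the project objects) uses the staged_predict generator exposed by sklearn boosting estimators to track validation F1 as members are added:

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Synthetic stand-in data for illustration only
X, y = make_classification(n_samples=400, n_features=8, random_state=987654321)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=987654321)

booster = AdaBoostClassifier(n_estimators=50, random_state=987654321).fit(X_train, y_train)

# staged_predict yields the running ensemble's prediction after 1, 2, ..., n stages
for stage, y_pred in enumerate(booster.staged_predict(X_val), start=1):
    if stage % 10 == 0:
        print(f"Stage {stage:2d}: validation F1 = {f1_score(y_val, y_pred):.4f}")

A flattening or declining curve indicates that additional weak learners no longer help, which is one way to gauge when boosting starts to overfit.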

1.8.1 AdaBoost ¶

AdaBoost (Adaptive Boosting) is a boosting technique that combines multiple weak learners — typically decision stumps (shallow trees) — to form a strong classifier. It works by iteratively training weak models, assigning higher weights to misclassified instances so that subsequent models focus on difficult cases. At each iteration, a new weak model is trained, and its predictions are combined using a weighted voting mechanism. This process continues until a stopping criterion is met, such as a predefined number of iterations or performance threshold. AdaBoost is advantageous because it improves accuracy without overfitting if regularized properly. It performs well with clean data and can transform weak classifiers into strong ones. However, it is sensitive to noisy data and outliers, as misclassified points receive higher importance, leading to potential overfitting. Additionally, training can be slow for large datasets, and performance depends on the choice of base learner, typically decision trees.
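The reweighting loop at the heart of AdaBoost can be written out directly. The sketch below is a simplified discrete-AdaBoost illustration on synthetic stand-in data (not the tuned pipeline developed in this section), showing how misclassified samples gain weight at every round:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in data with labels recoded to {-1, +1} (illustration only)
X, y = make_classification(n_samples=300, n_features=8, random_state=987654321)
y_signed = np.where(y == 1, 1, -1)

n_samples = len(X)
weights = np.full(n_samples, 1.0 / n_samples)   # start from uniform sample weights
stumps, alphas = [], []

for _ in range(20):
    stump = DecisionTreeClassifier(max_depth=1, random_state=987654321)
    stump.fit(X, y_signed, sample_weight=weights)
    prediction = stump.predict(X)

    # Weighted error rate and the stump's voting weight (alpha)
    error = np.sum(weights * (prediction != y_signed))
    alpha = 0.5 * np.log((1 - error) / max(error, 1e-10))

    # Misclassified samples gain weight; correctly classified samples lose weight
    weights *= np.exp(-alpha * y_signed * prediction)
    weights /= weights.sum()

    stumps.append(stump)
    alphas.append(alpha)

# Final classification: sign of the alpha-weighted vote over all stumps
vote = sum(a * s.predict(X) for a, s in zip(alphas, stumps))
ensemble_prediction = np.sign(vote).astype(int)

The AdaBoostClassifier tuned below applies the same reweighting principle, with learning_rate scaling each stump's contribution to the weighted vote.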

  1. The AdaBoost model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • estimator_max_depth = maximum depth of the tree made to vary between 1 and 2
    • learning_rate = weight applied to each classifier at each boosting iteration made to vary between 0.01 and 0.10
    • n_estimators = maximum number of estimators at which boosting is terminated made to vary between 50 and 100
  3. No hyperparameter was defined in the model to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • estimator_max_depth = 2
    • learning_rate = 0.01
    • n_estimators = 50
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9019
    • Precision = 0.8059
    • Recall = 0.8852
    • F1 Score = 0.8437
    • AUROC = 0.8971
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. The apparent and independent validation model performance was sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [208]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [209]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_ab_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('boosted_ab_model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=987654321),
                                            random_state=987654321))
])
In [210]:
##################################
# Defining hyperparameter grid
##################################
boosted_ab_hyperparameter_grid = {
    'boosted_ab_model__learning_rate': [0.01, 0.10],  
    'boosted_ab_model__estimator__max_depth': [1, 2],
    'boosted_ab_model__n_estimators': [50, 100] 
}
In [211]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [212]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_ab_grid_search = GridSearchCV(
    estimator=boosted_ab_pipeline,
    param_grid=boosted_ab_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [213]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [214]:
##################################
# Fitting GridSearchCV
##################################
boosted_ab_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[214]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('boosted_ab_model',
                                        AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=987654321),
                                                           random_state=987654321))]),
             n_jobs=-1,
             param_grid={'boosted_ab_model__estimator__max_depth': [1, 2],
                         'boosted_ab_model__learning_rate': [0.01, 0.1],
                         'boosted_ab_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [215]:
##################################
# Identifying the best model
##################################
boosted_ab_optimal = boosted_ab_grid_search.best_estimator_
In [216]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_ab_optimal_f1_cv = boosted_ab_grid_search.best_score_
boosted_ab_optimal_f1_train = f1_score(y_preprocessed_train_encoded, boosted_ab_optimal.predict(X_preprocessed_train))
boosted_ab_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, boosted_ab_optimal.predict(X_preprocessed_validation))
In [217]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - AdaBoost: ')
print(f"Best AdaBoost Hyperparameters: {boosted_ab_grid_search.best_params_}")
Best Boosted Model - AdaBoost: 
Best AdaBoost Hyperparameters: {'boosted_ab_model__estimator__max_depth': 2, 'boosted_ab_model__learning_rate': 0.01, 'boosted_ab_model__n_estimators': 50}
In [218]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_ab_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_ab_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, boosted_ab_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8364
F1 Score on Training Data: 0.8438

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       143
         1.0       0.81      0.89      0.84        61

    accuracy                           0.90       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.90      0.90       204

In [219]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, boosted_ab_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, boosted_ab_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal AdaBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal AdaBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal AdaBoost train performance]
In [220]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_ab_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, boosted_ab_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [221]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, boosted_ab_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, boosted_ab_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal AdaBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal AdaBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal AdaBoost validation performance]
In [222]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_ab_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, boosted_ab_optimal.predict(X_preprocessed_train))
boosted_ab_optimal_train['model'] = ['boosted_ab_optimal'] * 5
boosted_ab_optimal_train['set'] = ['train'] * 5
print('Optimal AdaBoost Train Performance Metrics: ')
display(boosted_ab_optimal_train)
Optimal AdaBoost Train Performance Metrics: 
  metric_name  metric_value               model    set
0    Accuracy      0.901961  boosted_ab_optimal  train
1   Precision      0.805970  boosted_ab_optimal  train
2      Recall      0.885246  boosted_ab_optimal  train
3          F1      0.843750  boosted_ab_optimal  train
4       AUROC      0.897168  boosted_ab_optimal  train
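For context, model_performance_evaluation is a helper defined earlier in the notebook; a minimal sketch consistent with the long-format tables shown here (assumed for illustration, not the notebook's exact definition):

##################################
# Assumed shape of the metric-gathering helper:
# hard predictions are scored on five metrics and
# returned as a long-format DataFrame
##################################
import pandas as pd
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)

def model_performance_evaluation_sketch(y_true, y_pred):
    metrics = {'Accuracy': accuracy_score(y_true, y_pred),
               'Precision': precision_score(y_true, y_pred),
               'Recall': recall_score(y_true, y_pred),
               'F1': f1_score(y_true, y_pred),
               'AUROC': roc_auc_score(y_true, y_pred)}
    return pd.DataFrame({'metric_name': list(metrics.keys()),
                         'metric_value': list(metrics.values())})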
In [223]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_ab_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, boosted_ab_optimal.predict(X_preprocessed_validation))
boosted_ab_optimal_validation['model'] = ['boosted_ab_optimal'] * 5
boosted_ab_optimal_validation['set'] = ['validation'] * 5
print('Optimal AdaBoost Validation Performance Metrics: ')
display(boosted_ab_optimal_validation)
Optimal AdaBoost Validation Performance Metrics: 
  metric_name  metric_value               model         set
0    Accuracy      0.913043  boosted_ab_optimal  validation
1   Precision      0.818182  boosted_ab_optimal  validation
2      Recall      0.900000  boosted_ab_optimal  validation
3          F1      0.857143  boosted_ab_optimal  validation
4       AUROC      0.909184  boosted_ab_optimal  validation
In [224]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_ab_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_adaboost_optimal.pkl"))
Out[224]:
['..\\models\\boosted_model_adaboost_optimal.pkl']
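Once persisted, the pipeline can be restored for later inference without refitting; a minimal sketch mirroring the save call above (the reloaded variable names are hypothetical):

##################################
# Reloading the persisted AdaBoost pipeline
# for downstream inference (illustrative only)
##################################
import os
import joblib
boosted_ab_reloaded = joblib.load(os.path.join("..", MODELS_PATH, "boosted_model_adaboost_optimal.pkl"))
reloaded_predictions = boosted_ab_reloaded.predict(X_preprocessed_validation)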

1.8.2 Gradient Boosting ¶

Gradient Boosting builds an ensemble of decision trees sequentially, where each new tree corrects the mistakes of the previous ones by optimizing a loss function. Unlike AdaBoost, which reweights misclassified instances, Gradient Boosting fits each new tree to the residual errors of the previous model, gradually improving predictions. This process continues until a stopping criterion, such as a set number of trees, is met. The key advantages of Gradient Boosting include its flexibility to model complex relationships and strong predictive performance, often outperforming bagging methods. It can handle both numeric and categorical data well. However, it is prone to overfitting if not carefully tuned, especially with deep trees and too many iterations. It is also computationally expensive due to sequential training, and hyperparameter tuning (e.g., learning rate, number of trees, tree depth) can be challenging and time-consuming.
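Before tuning the full pipeline, the residual-fitting mechanism described above can be made concrete with a toy regression sketch (illustrative only; the variable names are hypothetical and squared-error loss is assumed, for which the negative gradient equals the plain residual):

##################################
# Minimal sketch of the residual-fitting idea behind
# gradient boosting: each shallow tree is fit to the
# residuals of the current ensemble prediction, then
# added with shrinkage (the learning rate)
##################################
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(987654321)
X_toy = rng.uniform(0, 10, size=(200, 1))
y_toy = np.sin(X_toy).ravel() + rng.normal(0, 0.2, size=200)

learning_rate = 0.1
prediction = np.full_like(y_toy, y_toy.mean())  # initial constant model
for _ in range(50):
    residuals = y_toy - prediction  # negative gradient of squared-error loss
    stage_tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_toy, residuals)
    prediction += learning_rate * stage_tree.predict(X_toy)  # shrunken additive update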

  1. The gradient boosting model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = shrinking proportion of the contribution from each tree made to vary between 0.01 and 0.10
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • n_estimators = number of boosting stages to perform made to vary between 50 and 100
  3. No hyperparameter was defined in the model to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.10
    • max_depth = 3
    • min_samples_leaf = 10
    • n_estimators = 50
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9460
    • Precision = 0.9032
    • Recall = 0.9180
    • F1 Score = 0.9105
    • AUROC = 0.9380
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.8095
    • Recall = 0.8500
    • F1 Score = 0.8292
    • AUROC = 0.8841
  7. A relatively large difference between the apparent and independent validation model performance was observed, which might be indicative of moderate model overfitting.
In [225]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [226]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_gb_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('boosted_gb_model', GradientBoostingClassifier(random_state=987654321))
])
In [227]:
##################################
# Defining hyperparameter grid
##################################
boosted_gb_hyperparameter_grid = {
    'boosted_gb_model__learning_rate': [0.01, 0.10],
    'boosted_gb_model__max_depth': [3, 6], 
    'boosted_gb_model__min_samples_leaf': [5, 10],
    'boosted_gb_model__n_estimators': [50, 100] 
}
In [228]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
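With n_splits=5 and n_repeats=5, each candidate configuration is scored on 5 × 5 = 25 resampled folds, so the 16-candidate grid below accounts for the 16 × 25 = 400 fits reported by GridSearchCV.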
In [229]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_gb_grid_search = GridSearchCV(
    estimator=boosted_gb_pipeline,
    param_grid=boosted_gb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [230]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [231]:
##################################
# Fitting GridSearchCV
##################################
boosted_gb_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[231]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('boosted_gb_model',
                                        GradientBoostingClassifier(random_state=987654321))]),
             n_jobs=-1,
             param_grid={'boosted_gb_model__learning_rate': [0.01, 0.1],
                         'boosted_gb_model__max_depth': [3, 6],
                         'boosted_gb_model__min_samples_leaf': [5, 10],
                         'boosted_gb_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [232]:
##################################
# Identifying the best model
##################################
boosted_gb_optimal = boosted_gb_grid_search.best_estimator_
In [233]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_gb_optimal_f1_cv = boosted_gb_grid_search.best_score_
boosted_gb_optimal_f1_train = f1_score(y_preprocessed_train_encoded, boosted_gb_optimal.predict(X_preprocessed_train))
boosted_gb_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, boosted_gb_optimal.predict(X_preprocessed_validation))
In [234]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - Gradient Boosting: ')
print(f"Best Gradient Boosting Hyperparameters: {boosted_gb_grid_search.best_params_}")
Best Boosted Model - Gradient Boosting: 
Best Gradient Boosting Hyperparameters: {'boosted_gb_model__learning_rate': 0.1, 'boosted_gb_model__max_depth': 3, 'boosted_gb_model__min_samples_leaf': 10, 'boosted_gb_model__n_estimators': 50}
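Beyond the single winning configuration, the closeness of competing candidates can be inspected through the grid search's standard cv_results_ attribute; a brief sketch:

##################################
# Ranking all tuned candidates by mean cross-validated F1
# to gauge how decisive the selected configuration was
##################################
import pandas as pd
gb_cv_results = pd.DataFrame(boosted_gb_grid_search.cv_results_)
display(gb_cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
        .sort_values('rank_test_score')
        .head())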
In [235]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_gb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_gb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, boosted_gb_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8131
F1 Score on Training Data: 0.9106

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       143
         1.0       0.90      0.92      0.91        61

    accuracy                           0.95       204
   macro avg       0.93      0.94      0.94       204
weighted avg       0.95      0.95      0.95       204

In [236]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, boosted_gb_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, boosted_gb_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Gradient Boosting Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Gradient Boosting Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Gradient Boosting train performance]
In [237]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_gb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, boosted_gb_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8293

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.94      0.92      0.93        49
         1.0       0.81      0.85      0.83        20

    accuracy                           0.90        69
   macro avg       0.87      0.88      0.88        69
weighted avg       0.90      0.90      0.90        69

In [238]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, boosted_gb_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, boosted_gb_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Gradient Boosting Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Gradient Boosting Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Gradient Boosting validation performance]
In [239]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_gb_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, boosted_gb_optimal.predict(X_preprocessed_train))
boosted_gb_optimal_train['model'] = ['boosted_gb_optimal'] * 5
boosted_gb_optimal_train['set'] = ['train'] * 5
print('Optimal Gradient Boosting Train Performance Metrics: ')
display(boosted_gb_optimal_train)
Optimal Gradient Boosting Train Performance Metrics: 
  metric_name  metric_value               model    set
0    Accuracy      0.946078  boosted_gb_optimal  train
1   Precision      0.903226  boosted_gb_optimal  train
2      Recall      0.918033  boosted_gb_optimal  train
3          F1      0.910569  boosted_gb_optimal  train
4       AUROC      0.938037  boosted_gb_optimal  train
In [240]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_gb_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, boosted_gb_optimal.predict(X_preprocessed_validation))
boosted_gb_optimal_validation['model'] = ['boosted_gb_optimal'] * 5
boosted_gb_optimal_validation['set'] = ['validation'] * 5
print('Optimal Gradient Boosting Validation Performance Metrics: ')
display(boosted_gb_optimal_validation)
Optimal Gradient Boosting Validation Performance Metrics: 
  metric_name  metric_value               model         set
0    Accuracy      0.898551  boosted_gb_optimal  validation
1   Precision      0.809524  boosted_gb_optimal  validation
2      Recall      0.850000  boosted_gb_optimal  validation
3          F1      0.829268  boosted_gb_optimal  validation
4       AUROC      0.884184  boosted_gb_optimal  validation
In [241]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_gb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_gradient_boosting_optimal.pkl"))
Out[241]:
['..\\models\\boosted_model_gradient_boosting_optimal.pkl']

1.8.3 XGBoost ¶

XGBoost (Extreme Gradient Boosting) is an optimized version of Gradient Boosting that introduces additional regularization and computational efficiencies. It builds decision trees sequentially, with each new tree correcting the residual errors of the previous ones, but it incorporates advanced techniques such as shrinkage (learning rate), column subsampling, and L1/L2 regularization to prevent overfitting. Additionally, XGBoost employs parallelization, reducing training time significantly compared to standard Gradient Boosting. It is widely used in machine learning competitions due to its superior accuracy and efficiency. The key advantages include its ability to handle missing data, built-in regularization for better generalization, and fast training through parallelization. However, XGBoost requires careful hyperparameter tuning to achieve optimal performance, and the model can become overly complex, making interpretation difficult. It is also memory-intensive, especially for large datasets, and can be challenging to deploy efficiently in real-time applications.

  1. The xgboost model from the xgboost Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • gamma = minimum loss reduction required to make a further split in a tree made to vary between 0.10 and 0.20
    • n_estimators = number of boosting stages to perform made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 2.0) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories (the sketch after this list shows how this value follows from the class counts).
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.01
    • max_depth = 3
    • gamma = 0.10
    • n_estimators = 50
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9068
    • Precision = 0.8181
    • Recall = 0.8852
    • F1 Score = 0.8503
    • AUROC = 0.9006
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. Sufficiently comparable apparent and independent validation model performance was observed, which might be indicative of the absence of excessive model overfitting.
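As referenced in item 3 above, the fixed scale_pos_weight can be derived from the training class counts; a minimal sketch, assuming y_preprocessed_train_encoded codes recurrence (Yes) as 1:

##################################
# Deriving scale_pos_weight from the class counts:
# XGBoost's convention is (number of negatives) / (number of positives);
# with 143 No and 61 Yes cases in the training split this gives
# 143 / 61 ≈ 2.34, consistent with the fixed value of 2.0
##################################
import numpy as np
n_negative = int(np.sum(y_preprocessed_train_encoded == 0))
n_positive = int(np.sum(y_preprocessed_train_encoded == 1))
print(f"scale_pos_weight ≈ {n_negative / n_positive:.2f}")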
In [242]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [243]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_xgb_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('boosted_xgb_model', XGBClassifier(scale_pos_weight=2.0, 
                                        random_state=987654321,
                                        subsample=0.7,
                                        colsample_bytree=0.7,
                                        eval_metric='logloss'))
])
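Note that the fixed subsample=0.7 and colsample_bytree=0.7 settings implement the row and column subsampling mentioned in the description above, injecting stochasticity into each boosting round that typically improves generalization.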
In [244]:
##################################
# Defining hyperparameter grid
##################################
boosted_xgb_hyperparameter_grid = {
    'boosted_xgb_model__learning_rate': [0.01, 0.10],
    'boosted_xgb_model__max_depth': [3, 6], 
    'boosted_xgb_model__gamma': [0.1, 0.2],
    'boosted_xgb_model__n_estimators': [50, 100] 
}
In [245]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [246]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_xgb_grid_search = GridSearchCV(
    estimator=boosted_xgb_pipeline,
    param_grid=boosted_xgb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [247]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [248]:
##################################
# Fitting GridSearchCV
##################################
boosted_xgb_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[248]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])...
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      multi_strategy=None,
                                                      n_estimators=None,
                                                      n_jobs=None,
                                                      num_parallel_tree=None,
                                                      random_state=987654321, ...))]),
             n_jobs=-1,
             param_grid={'boosted_xgb_model__gamma': [0.1, 0.2],
                         'boosted_xgb_model__learning_rate': [0.01, 0.1],
                         'boosted_xgb_model__max_depth': [3, 6],
                         'boosted_xgb_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [249]:
##################################
# Identifying the best model
##################################
boosted_xgb_optimal = boosted_xgb_grid_search.best_estimator_
In [250]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_xgb_optimal_f1_cv = boosted_xgb_grid_search.best_score_
boosted_xgb_optimal_f1_train = f1_score(y_preprocessed_train_encoded, boosted_xgb_optimal.predict(X_preprocessed_train))
boosted_xgb_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, boosted_xgb_optimal.predict(X_preprocessed_validation))
In [251]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - XGBoost: ')
print(f"Best XGBoost Hyperparameters: {boosted_xgb_grid_search.best_params_}")
Best Boosted Model - XGBoost: 
Best XGBoost Hyperparameters: {'boosted_xgb_model__gamma': 0.1, 'boosted_xgb_model__learning_rate': 0.01, 'boosted_xgb_model__max_depth': 3, 'boosted_xgb_model__n_estimators': 50}
In [252]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_xgb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_xgb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, boosted_xgb_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8322
F1 Score on Training Data: 0.8504

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.92      0.93       143
         1.0       0.82      0.89      0.85        61

    accuracy                           0.91       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.91      0.91       204

In [253]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, boosted_xgb_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, boosted_xgb_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal XGBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal XGBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal XGBoost train performance]
In [254]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_xgb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, boosted_xgb_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [255]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, boosted_xgb_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, boosted_xgb_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal XGBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal XGBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal XGBoost validation performance]
In [256]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_xgb_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, boosted_xgb_optimal.predict(X_preprocessed_train))
boosted_xgb_optimal_train['model'] = ['boosted_xgb_optimal'] * 5
boosted_xgb_optimal_train['set'] = ['train'] * 5
print('Optimal XGBoost Train Performance Metrics: ')
display(boosted_xgb_optimal_train)
Optimal XGBoost Train Performance Metrics: 
  metric_name  metric_value                model    set
0    Accuracy      0.906863  boosted_xgb_optimal  train
1   Precision      0.818182  boosted_xgb_optimal  train
2      Recall      0.885246  boosted_xgb_optimal  train
3          F1      0.850394  boosted_xgb_optimal  train
4       AUROC      0.900665  boosted_xgb_optimal  train
In [257]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_xgb_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, boosted_xgb_optimal.predict(X_preprocessed_validation))
boosted_xgb_optimal_validation['model'] = ['boosted_xgb_optimal'] * 5
boosted_xgb_optimal_validation['set'] = ['validation'] * 5
print('Optimal XGBoost Validation Performance Metrics: ')
display(boosted_xgb_optimal_validation)
Optimal XGBoost Validation Performance Metrics: 
  metric_name  metric_value                model         set
0    Accuracy      0.913043  boosted_xgb_optimal  validation
1   Precision      0.818182  boosted_xgb_optimal  validation
2      Recall      0.900000  boosted_xgb_optimal  validation
3          F1      0.857143  boosted_xgb_optimal  validation
4       AUROC      0.909184  boosted_xgb_optimal  validation
In [258]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_xgb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_xgboost_optimal.pkl"))
Out[258]:
['..\\models\\boosted_model_xgboost_optimal.pkl']

1.8.4 Light GBM ¶

Light GBM (Light Gradient Boosting Machine) is a variation of Gradient Boosting designed for efficiency and scalability. Unlike traditional boosting methods that grow trees level by level, LightGBM grows trees leaf-wise, choosing the most informative splits, leading to faster convergence. It also uses histogram-based binning to speed up computations. These optimizations allow LightGBM to train on large datasets efficiently while maintaining high accuracy. Its advantages include faster training speed, reduced memory usage, and strong predictive performance, particularly for large datasets with many features. However, LightGBM can overfit more easily than XGBoost if not properly tuned, and it may not perform as well on small datasets. Additionally, its handling of categorical variables requires careful preprocessing, and the leaf-wise tree growth can sometimes lead to instability if not controlled properly.
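The leaf-wise growth described above is controlled chiefly through num_leaves rather than tree depth; a minimal sketch on synthetic data (illustrative only; make_classification and the toy variable names are assumptions), mirroring the num_leaves, min_child_samples, and max_depth=-1 settings used in this section:

##################################
# Minimal sketch of leaf-wise complexity control in LightGBM:
# num_leaves bounds the tree size directly, while max_depth=-1
# leaves the depth unconstrained
##################################
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=300, n_features=10, random_state=987654321)
lgbm_toy = LGBMClassifier(num_leaves=16, max_depth=-1, min_child_samples=6,
                          learning_rate=0.01, n_estimators=100, verbose=-1)
lgbm_toy.fit(X_toy, y_toy)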

  1. The light gbm model from the lightgbm Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • min_child_samples = minimum number of data points needed in a child (leaf) made to vary between 3 and 6
    • num_leaves = maximum tree leaves for base learners made to vary between 8 and 16
    • n_estimators = number of boosted trees to fit made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 2.0) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.01
    • min_child_samples = 6
    • num_leaves = 16
    • n_estimators = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9362
    • Precision = 0.8870
    • Recall = 0.9016
    • F1 Score = 0.8943
    • AUROC = 0.9263
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.8421
    • Recall = 0.8000
    • F1 Score = 0.8205
    • AUROC = 0.8693
  7. A relatively large difference between the apparent and independent validation model performance was observed, which might be indicative of moderate model overfitting.
In [259]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [260]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_lgbm_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('boosted_lgbm_model', LGBMClassifier(scale_pos_weight=2.0, 
                                          random_state=987654321,
                                          max_depth=-1,
                                          feature_fraction =0.7,
                                          bagging_fraction=0.7,
                                          verbose=-1))
])
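Here, feature_fraction and bagging_fraction play the same roles as XGBoost's colsample_bytree and subsample; note that, per the LightGBM documentation, bagging_fraction only takes effect when bagging_freq is also set to a non-zero value, so the row subsampling configured here may be inactive.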
In [261]:
##################################
# Defining hyperparameter grid
##################################
boosted_lgbm_hyperparameter_grid = {
    'boosted_lgbm_model__learning_rate': [0.01, 0.10],
    'boosted_lgbm_model__min_child_samples': [3, 6], 
    'boosted_lgbm_model__num_leaves': [8, 16],
    'boosted_lgbm_model__n_estimators': [50, 100] 
}
In [262]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [263]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_lgbm_grid_search = GridSearchCV(
    estimator=boosted_lgbm_pipeline,
    param_grid=boosted_lgbm_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [264]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [265]:
##################################
# Fitting GridSearchCV
##################################
boosted_lgbm_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[265]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])...
                                       ('boosted_lgbm_model',
                                        LGBMClassifier(bagging_fraction=0.7,
                                                       feature_fraction=0.7,
                                                       random_state=987654321,
                                                       scale_pos_weight=2.0,
                                                       verbose=-1))]),
             n_jobs=-1,
             param_grid={'boosted_lgbm_model__learning_rate': [0.01, 0.1],
                         'boosted_lgbm_model__min_child_samples': [3, 6],
                         'boosted_lgbm_model__n_estimators': [50, 100],
                         'boosted_lgbm_model__num_leaves': [8, 16]},
             scoring='f1', verbose=1)
In [266]:
##################################
# Identifying the best model
##################################
boosted_lgbm_optimal = boosted_lgbm_grid_search.best_estimator_
In [267]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.utils.validation')
boosted_lgbm_optimal_f1_cv = boosted_lgbm_grid_search.best_score_
boosted_lgbm_optimal_f1_train = f1_score(y_preprocessed_train_encoded, boosted_lgbm_optimal.predict(X_preprocessed_train))
boosted_lgbm_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, boosted_lgbm_optimal.predict(X_preprocessed_validation))
In [268]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - Light GBM: ')
print(f"Best Light GBM Hyperparameters: {boosted_lgbm_grid_search.best_params_}")
Best Boosted Model - Light GBM: 
Best Light GBM Hyperparameters: {'boosted_lgbm_model__learning_rate': 0.01, 'boosted_lgbm_model__min_child_samples': 6, 'boosted_lgbm_model__n_estimators': 100, 'boosted_lgbm_model__num_leaves': 16}
In [269]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_lgbm_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_lgbm_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, boosted_lgbm_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8182
F1 Score on Training Data: 0.8943

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.95      0.95       143
         1.0       0.89      0.90      0.89        61

    accuracy                           0.94       204
   macro avg       0.92      0.93      0.92       204
weighted avg       0.94      0.94      0.94       204

In [270]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, boosted_lgbm_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, boosted_lgbm_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Light GBM Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Light GBM Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Light GBM train performance]
In [271]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_lgbm_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, boosted_lgbm_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8205

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.92      0.94      0.93        49
         1.0       0.84      0.80      0.82        20

    accuracy                           0.90        69
   macro avg       0.88      0.87      0.87        69
weighted avg       0.90      0.90      0.90        69

In [272]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, boosted_lgbm_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, boosted_lgbm_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Light GBM Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Light GBM Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Light GBM validation performance]
In [273]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_lgbm_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, boosted_lgbm_optimal.predict(X_preprocessed_train))
boosted_lgbm_optimal_train['model'] = ['boosted_lgbm_optimal'] * 5
boosted_lgbm_optimal_train['set'] = ['train'] * 5
print('Optimal Light GBM Train Performance Metrics: ')
display(boosted_lgbm_optimal_train)
Optimal Light GBM Train Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.936275 | boosted_lgbm_optimal | train |
| Precision | 0.887097 | boosted_lgbm_optimal | train |
| Recall | 0.901639 | boosted_lgbm_optimal | train |
| F1 | 0.894309 | boosted_lgbm_optimal | train |
| AUROC | 0.926344 | boosted_lgbm_optimal | train |
In [274]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_lgbm_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, boosted_lgbm_optimal.predict(X_preprocessed_validation))
boosted_lgbm_optimal_validation['model'] = ['boosted_lgbm_optimal'] * 5
boosted_lgbm_optimal_validation['set'] = ['validation'] * 5
print('Optimal Light GBM Validation Performance Metrics: ')
display(boosted_lgbm_optimal_validation)
Optimal Light GBM Validation Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.898551 | boosted_lgbm_optimal | validation |
| Precision | 0.842105 | boosted_lgbm_optimal | validation |
| Recall | 0.800000 | boosted_lgbm_optimal | validation |
| F1 | 0.820513 | boosted_lgbm_optimal | validation |
| AUROC | 0.869388 | boosted_lgbm_optimal | validation |
In [275]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_lgbm_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_light_gbm_optimal.pkl"))
Out[275]:
['..\\models\\boosted_model_light_gbm_optimal.pkl']

1.8.5 CatBoost ¶

CatBoost (Categorical Boosting) is a boosting algorithm optimized for categorical data. Unlike other gradient boosting methods that require categorical variables to be manually encoded, CatBoost handles them natively, reducing preprocessing effort and improving performance. It builds decision trees iteratively, like other boosting methods, but uses ordered boosting to prevent target leakage and enhance generalization. The main advantages of CatBoost are its ability to handle categorical data without extensive preprocessing, high accuracy with minimal tuning, and robustness against overfitting due to built-in regularization. Additionally, it is relatively fast and memory-efficient. However, CatBoost can still be slower than LightGBM on very large datasets, and while it requires less tuning, improper parameter selection can lead to suboptimal performance. Its internal mechanics, such as ordered boosting, make interpretation more complex compared to simpler models.
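To make the native categorical handling concrete, the minimal sketch below passes a raw string column directly to CatBoost through the cat_features argument of fit, with no manual encoding. The toy frame, column values, and settings are illustrative assumptions and are independent of the tuned pipeline implemented later in this section, which (for consistency with the other boosted models) ordinal-encodes the categorical features before fitting.

##################################
# Illustrative sketch (not a project cell):
# CatBoost consuming a raw categorical column natively
##################################
import pandas as pd
from catboost import CatBoostClassifier

# Hypothetical toy data for illustration only
X_toy = pd.DataFrame({
    'Risk': ['Low', 'High', 'Low', 'Intermediate', 'High', 'Low'],  # raw strings
    'Age': [35, 62, 41, 55, 70, 29]
})
y_toy = [0, 1, 0, 1, 1, 0]

# cat_features marks which columns CatBoost should treat as categorical natively
toy_model = CatBoostClassifier(iterations=50, learning_rate=0.1, verbose=False)
toy_model.fit(X_toy, y_toy, cat_features=['Risk'])
print(toy_model.predict(X_toy))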

  1. The catboost model from the catboost Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • max_depth = maximum depth of each decision tree in the boosting process made to vary between 3 and 6
    • num_leaves = maximum tree leaves for base learners made to vary between 8 and 16
    • iterations = number of boosted trees to fit made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 2.0) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories (see the brief sketch after this list).
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.01
    • max_depth = 3
    • num_leaves = 8
    • iterations = 50
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9019
    • Precision = 0.8059
    • Recall = 0.8852
    • F1 Score = 0.8437
    • AUROC = 0.8971
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. The apparent and independent validation model performance were sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
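As referenced in item 3 above, the fixed scale_pos_weight value can be derived directly from the observed class ratio. The sketch below is illustrative only: the 143:61 class counts are taken from the train-data classification report in this section, and the variable names are assumptions.

##################################
# Illustrative sketch (not a project cell):
# deriving scale_pos_weight from the class ratio
##################################
import numpy as np

y_labels = np.array([0] * 143 + [1] * 61)  # assumed counts from the train split
ratio = (y_labels == 0).sum() / (y_labels == 1).sum()
print(round(ratio, 2))  # ~2.34, approximated by the fixed scale_pos_weight = 2.0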
In [276]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [277]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_cb_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    # note: verbose is left at its default here, so CatBoost prints a
    # per-iteration training log when the final model is refit below
    ('boosted_cb_model', CatBoostClassifier(scale_pos_weight=2.0,
                                            random_state=987654321,
                                            subsample=0.7,
                                            colsample_bylevel=0.7,
                                            grow_policy='Lossguide'))
])
In [278]:
##################################
# Defining hyperparameter grid
##################################
boosted_cb_hyperparameter_grid = {
    'boosted_cb_model__learning_rate': [0.01, 0.10],
    'boosted_cb_model__max_depth': [3, 6], 
    'boosted_cb_model__num_leaves': [8, 16],
    'boosted_cb_model__iterations': [50, 100] 
}
In [279]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [280]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_cb_grid_search = GridSearchCV(
    estimator=boosted_cb_pipeline,
    param_grid=boosted_cb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [281]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [282]:
##################################
# Fitting GridSearchCV
##################################
boosted_cb_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
0:	learn: 0.6891722	total: 142ms	remaining: 6.93s
1:	learn: 0.6834783	total: 143ms	remaining: 3.43s
[... per-iteration CatBoost training log truncated: iterations 2-47 elided ...]
48:	learn: 0.5132195	total: 193ms	remaining: 3.93ms
49:	learn: 0.5104675	total: 193ms	remaining: 0us
Out[282]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('boosted_cb_model',
                                        <catboost.core.CatBoostClassifier object at 0x000002AD5551D3D0>)]),
             n_jobs=-1,
             param_grid={'boosted_cb_model__iterations': [50, 100],
                         'boosted_cb_model__learning_rate': [0.01, 0.1],
                         'boosted_cb_model__max_depth': [3, 6],
                         'boosted_cb_model__num_leaves': [8, 16]},
             scoring='f1', verbose=1)
In [283]:
##################################
# Identifying the best model
##################################
boosted_cb_optimal = boosted_cb_grid_search.best_estimator_
In [284]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_cb_optimal_f1_cv = boosted_cb_grid_search.best_score_
boosted_cb_optimal_f1_train = f1_score(y_preprocessed_train_encoded, boosted_cb_optimal.predict(X_preprocessed_train))
boosted_cb_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, boosted_cb_optimal.predict(X_preprocessed_validation))
In [285]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - CatBoost: ')
print(f"Best CatBoost Hyperparameters: {boosted_cb_grid_search.best_params_}")
Best Boosted Model - CatBoost: 
Best CatBoost Hyperparameters: {'boosted_cb_model__iterations': 50, 'boosted_cb_model__learning_rate': 0.01, 'boosted_cb_model__max_depth': 3, 'boosted_cb_model__num_leaves': 8}
In [286]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_cb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_cb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, boosted_cb_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8259
F1 Score on Training Data: 0.8438

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       143
         1.0       0.81      0.89      0.84        61

    accuracy                           0.90       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.90      0.90       204

In [287]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, boosted_cb_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, boosted_cb_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal CatBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal CatBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal CatBoost train performance]
In [288]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_cb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, boosted_cb_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [289]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, boosted_cb_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, boosted_cb_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal CatBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal CatBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal CatBoost validation performance]
In [290]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_cb_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, boosted_cb_optimal.predict(X_preprocessed_train))
boosted_cb_optimal_train['model'] = ['boosted_cb_optimal'] * 5
boosted_cb_optimal_train['set'] = ['train'] * 5
print('Optimal CatBoost Train Performance Metrics: ')
display(boosted_cb_optimal_train)
Optimal CatBoost Train Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.901961 | boosted_cb_optimal | train |
| Precision | 0.805970 | boosted_cb_optimal | train |
| Recall | 0.885246 | boosted_cb_optimal | train |
| F1 | 0.843750 | boosted_cb_optimal | train |
| AUROC | 0.897168 | boosted_cb_optimal | train |
In [291]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_cb_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, boosted_cb_optimal.predict(X_preprocessed_validation))
boosted_cb_optimal_validation['model'] = ['boosted_cb_optimal'] * 5
boosted_cb_optimal_validation['set'] = ['validation'] * 5
print('Optimal CatBoost Validation Performance Metrics: ')
display(boosted_cb_optimal_validation)
Optimal CatBoost Validation Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.913043 | boosted_cb_optimal | validation |
| Precision | 0.818182 | boosted_cb_optimal | validation |
| Recall | 0.900000 | boosted_cb_optimal | validation |
| F1 | 0.857143 | boosted_cb_optimal | validation |
| AUROC | 0.909184 | boosted_cb_optimal | validation |
In [292]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_cb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_catboost_optimal.pkl"))
Out[292]:
['..\\models\\boosted_model_catboost_optimal.pkl']

1.9 Stacked Model Development ¶

Stacking, or stacked generalization, is an advanced ensemble method that improves predictive performance by training a meta-model to learn the optimal way to combine multiple base models using their out-of-fold predictions. Unlike traditional ensemble techniques such as bagging and boosting, which aggregate predictions through simple rules like averaging or majority voting, stacking introduces a second-level model that intelligently learns how to integrate diverse base models. The process starts by training multiple classifiers on the training dataset. However, instead of directly using their predictions, stacking employs k-fold cross-validation to generate out-of-fold predictions. Specifically, each base model is trained on a subset of the training data while leaving out a validation fold, and predictions on that unseen fold are recorded. This process is repeated across all folds, ensuring that each instance in the training data receives predictions from models that never saw it during training. These out-of-fold predictions are then used as input features for a meta-model, which learns the best way to combine them into a final decision. The advantage of stacking is that it allows different models to complement each other, capturing diverse aspects of the data that a single model might miss. This often results in superior classification accuracy compared to individual models or simpler ensemble approaches. However, stacking is computationally expensive, requiring multiple training iterations for base models and the additional meta-model. It also demands careful tuning to prevent overfitting, as the meta-model’s complexity can introduce new sources of error. Despite these challenges, stacking remains a powerful technique in applications where maximizing predictive performance is a priority.
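The mechanics described above can be sketched with scikit-learn's StackingClassifier, whose cv argument generates the out-of-fold predictions internally before the meta-model is trained on them. The snippet below uses synthetic data and a reduced, untuned learner set purely for illustration; the project's actual preprocessing, hyperparameter tuning, and full set of base learners follow in the subsections below.

##################################
# Illustrative sketch (not a project cell):
# stacking with out-of-fold predictions via StackingClassifier
##################################
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Hypothetical synthetic data for illustration only
X_toy, y_toy = make_classification(n_samples=300, n_features=10, random_state=0)

# cv=5 produces out-of-fold predictions from each base learner;
# the Logistic Regression meta-learner is then trained on those predictions
stack = StackingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier(n_neighbors=3)),
        ('svm', SVC(kernel='linear', random_state=0)),
        ('ridge', RidgeClassifier())
    ],
    final_estimator=LogisticRegression(),
    cv=5
)
stack.fit(X_toy, y_toy)
print(stack.score(X_toy, y_toy))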

1.9.1 Base Learner - K-Nearest Neighbors ¶

K-Nearest Neighbors (KNN) is a non-parametric classification algorithm that makes predictions based on the majority class among the k-nearest training samples in feature space. It does not create an explicit model during training; instead, it stores the entire dataset and computes distances between a query point and all training samples during inference. The algorithm follows three key steps: (1) compute the distance between the query point and all training samples (typically using Euclidean distance), (2) identify the k closest points, and (3) assign the most common class among them as the predicted label. KNN is advantageous because it is simple, requires minimal training time, and can model complex decision boundaries when provided with sufficient data. However, it has significant drawbacks: it is computationally expensive for large datasets since distances must be computed for every prediction, it is sensitive to irrelevant or redundant features, and it requires careful selection of k, as a small k can make the model too sensitive to noise while a large k can overly smooth decision boundaries.
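The three steps enumerated above translate directly into a few lines of NumPy. The sketch below, including its toy arrays and the knn_predict helper, is purely illustrative and is not part of the project pipeline, which uses the scikit-learn implementation shown later in this subsection.

##################################
# Illustrative sketch (not a project cell):
# the three KNN prediction steps in plain NumPy
##################################
import numpy as np

def knn_predict(X_train, y_train, x_query, k=3):
    # (1) compute Euclidean distances from the query to all training samples
    distances = np.linalg.norm(X_train - x_query, axis=1)
    # (2) identify the k closest points
    nearest = np.argsort(distances)[:k]
    # (3) assign the most common class among those k neighbors
    values, counts = np.unique(y_train[nearest], return_counts=True)
    return values[np.argmax(counts)]

# Hypothetical toy data for illustration only
X_train_toy = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [6.0, 9.0]])
y_train_toy = np.array([0, 0, 1, 1])
print(knn_predict(X_train_toy, y_train_toy, np.array([1.2, 1.9]), k=3))  # -> 0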

  1. The k-nearest neighbors model from the sklearn.neighbors Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • n_neighbors = number of neighbors to use made to vary between 3 and 5
    • weights = weight function used in prediction made to vary between uniform and distance
    • metric = metric to use for distance computation made to vary between minkowski and euclidean (with the default p = 2, minkowski is equivalent to euclidean, so these two settings coincide here)
  3. No special hyperparameter was defined in the model to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • n_neighbors = 3
    • weights = uniform
    • metric = minkowski
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9215
    • Precision = 0.9090
    • Recall = 0.8196
    • F1 Score = 0.8620
    • AUROC = 0.8923
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8115
    • Precision = 0.7058
    • Recall = 0.6000
    • F1 Score = 0.6486
    • AUROC = 0.7489
  7. A relatively large difference between the apparent and independent validation model performance was observed, which might be indicative of moderate model overfitting.
In [293]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender', 'Smoking', 'Physical_Examination', 'Adenopathy', 'Focality', 'Risk', 'T', 'Stage', 'Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
    force_int_remainder_cols=False)
In [294]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
stacked_baselearner_knn_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('stacked_baselearner_knn_model', KNeighborsClassifier())
])
In [295]:
##################################
# Defining hyperparameter grid
##################################
stacked_baselearner_knn_hyperparameter_grid = {
    'stacked_baselearner_knn_model__n_neighbors': [3, 5],
    'stacked_baselearner_knn_model__weights': ['uniform', 'distance'],
    'stacked_baselearner_knn_model__metric': ['minkowski', 'euclidean']
}
In [296]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [297]:
##################################
# Performing Grid Search with cross-validation
##################################
stacked_baselearner_knn_grid_search = GridSearchCV(
    estimator=stacked_baselearner_knn_pipeline,
    param_grid=stacked_baselearner_knn_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [298]:
##################################
# Encoding the response variables
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [299]:
##################################
# Fitting GridSearchCV
##################################
stacked_baselearner_knn_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[299]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('stacked_baselearner_knn_model',
                                        KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'stacked_baselearner_knn_model__metric': ['minkowski',
                                                                   'euclidean'],
                         'stacked_baselearner_knn_model__n_neighbors': [3, 5],
                         'stacked_baselearner_knn_model__weights': ['uniform',
                                                                    'distance']},
             scoring='f1', verbose=1)
In [300]:
##################################
# Identifying the best model
##################################
stacked_baselearner_knn_optimal = stacked_baselearner_knn_grid_search.best_estimator_
In [301]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
stacked_baselearner_knn_optimal_f1_cv = stacked_baselearner_knn_grid_search.best_score_
stacked_baselearner_knn_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_train))
stacked_baselearner_knn_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_validation))
In [302]:
##################################
# Identifying the optimal model
##################################
print('Best Stacked Base Learner KNN: ')
print(f"Best Stacked Base Learner KNN Hyperparameters: {stacked_baselearner_knn_grid_search.best_params_}")
Best Stacked Base Learner KNN: 
Best Stacked Base Learner KNN Hyperparameters: {'stacked_baselearner_knn_model__metric': 'minkowski', 'stacked_baselearner_knn_model__n_neighbors': 3, 'stacked_baselearner_knn_model__weights': 'uniform'}
In [303]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {stacked_baselearner_knn_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {stacked_baselearner_knn_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.6417
F1 Score on Training Data: 0.8621

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       143
         1.0       0.91      0.82      0.86        61

    accuracy                           0.92       204
   macro avg       0.92      0.89      0.90       204
weighted avg       0.92      0.92      0.92       204

In [304]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner KNN Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner KNN Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Stacked Base Learner KNN train performance]
In [305]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {stacked_baselearner_knn_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.6486

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.85      0.90      0.87        49
         1.0       0.71      0.60      0.65        20

    accuracy                           0.81        69
   macro avg       0.78      0.75      0.76        69
weighted avg       0.81      0.81      0.81        69

In [306]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner KNN Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner KNN Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal Stacked Base Learner KNN validation performance]
In [307]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_baselearner_knn_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_train))
stacked_baselearner_knn_optimal_train['model'] = ['stacked_baselearner_knn_optimal'] * 5
stacked_baselearner_knn_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Base Learner KNN Train Performance Metrics: ')
display(stacked_baselearner_knn_optimal_train)
Optimal Stacked Base Learner KNN Train Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.921569 | stacked_baselearner_knn_optimal | train |
| Precision | 0.909091 | stacked_baselearner_knn_optimal | train |
| Recall | 0.819672 | stacked_baselearner_knn_optimal | train |
| F1 | 0.862069 | stacked_baselearner_knn_optimal | train |
| AUROC | 0.892354 | stacked_baselearner_knn_optimal | train |
In [308]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_baselearner_knn_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_baselearner_knn_optimal.predict(X_preprocessed_validation))
stacked_baselearner_knn_optimal_validation['model'] = ['stacked_baselearner_knn_optimal'] * 5
stacked_baselearner_knn_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Base Learner KNN Validation Performance Metrics: ')
display(stacked_baselearner_knn_optimal_validation)
Optimal Stacked Base Learner KNN Validation Performance Metrics: 
| metric_name | metric_value | model | set |
|---|---|---|---|
| Accuracy | 0.811594 | stacked_baselearner_knn_optimal | validation |
| Precision | 0.705882 | stacked_baselearner_knn_optimal | validation |
| Recall | 0.600000 | stacked_baselearner_knn_optimal | validation |
| F1 | 0.648649 | stacked_baselearner_knn_optimal | validation |
| AUROC | 0.748980 | stacked_baselearner_knn_optimal | validation |
In [309]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(stacked_baselearner_knn_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_baselearner_knn_optimal.pkl"))
Out[309]:
['..\\models\\stacked_model_baselearner_knn_optimal.pkl']

1.9.2 Base Learner - Support Vector Machine ¶

Support Vector Machine (SVM) is a powerful classification algorithm that finds an optimal decision boundary — called a hyperplane — that maximizes the margin between two classes. The algorithm works by identifying the most influential data points, known as support vectors, that define this boundary. If the data is not linearly separable, SVM can use kernel functions to map it into a higher-dimensional space where separation is possible. The main advantages of SVM include strong theoretical guarantees, effectiveness in high-dimensional spaces, and robustness against overfitting when properly regularized. It performs well when the margin between classes is clear and works effectively with small to medium-sized datasets. However, SVM has notable limitations: it is computationally expensive, making it impractical for very large datasets; it requires careful tuning of hyperparameters such as the kernel type and regularization strength; and it is not easily interpretable, as decision boundaries in high-dimensional space can be difficult to visualize.
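As a minimal illustration of the margin mechanics described above, the sketch below fits a linear SVC on separable synthetic blobs and inspects the support vectors that define the maximum-margin hyperplane. The data and most settings are assumptions for demonstration only, although class_weight='balanced' mirrors the fixed hyperparameter used later in this subsection.

##################################
# Illustrative sketch (not a project cell):
# a linear SVM and its boundary-defining support vectors
##################################
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# Hypothetical separable toy data for illustration only
X_toy, y_toy = make_blobs(n_samples=60, centers=2, random_state=0)

toy_svm = SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=0)
toy_svm.fit(X_toy, y_toy)

# only the boundary-defining points are retained as support vectors
print('Support vectors per class:', toy_svm.n_support_)
print('Hyperplane coefficients:', toy_svm.coef_, 'intercept:', toy_svm.intercept_)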

  1. The support vector machine model from the sklearn.svm Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • C = inverse of regularization strength made to vary between 0.1 and 1.0
    • kernel = kernel type to be used in the algorithm made to vary between linear and rbf
    • gamma = kernel coefficient made to vary between scale and auto (relevant only to the rbf kernel; it is ignored when kernel = linear)
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • C = 1.0
    • kernel = linear
    • gamma = scale
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9019
    • Precision = 0.8059
    • Recall = 0.8852
    • F1 Score = 0.8437
    • AUROC = 0.8971
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. The apparent and independent validation model performance were sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [310]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [311]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
stacked_baselearner_svm_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('stacked_baselearner_svm_model', SVC(class_weight='balanced',
                                          random_state=987654321))
])
In [312]:
##################################
# Defining hyperparameter grid
##################################
stacked_baselearner_svm_hyperparameter_grid = {
    'stacked_baselearner_svm_model__C': [0.1, 1.0],
    'stacked_baselearner_svm_model__kernel': ['linear', 'rbf'],
    'stacked_baselearner_svm_model__gamma': ['scale','auto']
}
In [313]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [314]:
##################################
# Performing Grid Search with cross-validation
##################################
stacked_baselearner_svm_grid_search = GridSearchCV(
    estimator=stacked_baselearner_svm_pipeline,
    param_grid=stacked_baselearner_svm_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [315]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [316]:
##################################
# Fitting GridSearchCV
##################################
stacked_baselearner_svm_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[316]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('stacked_baselearner_svm_model',
                                        SVC(class_weight='balanced',
                                            random_state=987654321))]),
             n_jobs=-1,
             param_grid={'stacked_baselearner_svm_model__C': [0.1, 1.0],
                         'stacked_baselearner_svm_model__gamma': ['scale',
                                                                  'auto'],
                         'stacked_baselearner_svm_model__kernel': ['linear',
                                                                   'rbf']},
             scoring='f1', verbose=1)
In [317]:
##################################
# Identifying the best model
##################################
stacked_baselearner_svm_optimal = stacked_baselearner_svm_grid_search.best_estimator_
In [318]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
stacked_baselearner_svm_optimal_f1_cv = stacked_baselearner_svm_grid_search.best_score_
stacked_baselearner_svm_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_train))
stacked_baselearner_svm_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_validation))
In [319]:
##################################
# Displaying the optimal model hyperparameters
##################################
print('Best Stacked Base Learner SVM: ')
print(f"Best Stacked Base Learner SVM Hyperparameters: {stacked_baselearner_svm_grid_search.best_params_}")
Best Stacked Base Learner SVM: 
Best Stacked Base Learner SVM Hyperparameters: {'stacked_baselearner_svm_model__C': 1.0, 'stacked_baselearner_svm_model__gamma': 'scale', 'stacked_baselearner_svm_model__kernel': 'linear'}
In [320]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {stacked_baselearner_svm_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {stacked_baselearner_svm_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8219
F1 Score on Training Data: 0.8438

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       143
         1.0       0.81      0.89      0.84        61

    accuracy                           0.90       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.90      0.90       204
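
As a quick sanity check, the class-1 F1 score follows directly from the reported precision and recall: F1 = 2PR / (P + R) = 2(0.8060)(0.8852) / (0.8060 + 0.8852) ≈ 0.8438, matching the training score above.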

In [321]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner SVM Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner SVM Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner SVM on the train data]
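
For reference, normalize='true' divides each row of the raw confusion matrix by its actual-class total, so that each row sums to one. A minimal equivalent computation, as a sketch reusing cm_raw from the cell above:

import numpy as np

# Row-normalize the raw counts so each actual class sums to 1,
# mirroring confusion_matrix(..., normalize='true')
cm_normalized_manual = cm_raw / cm_raw.sum(axis=1, keepdims=True)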
In [322]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {stacked_baselearner_svm_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [323]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner SVM Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner SVM Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner SVM on the validation data]
In [324]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_baselearner_svm_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_train))
stacked_baselearner_svm_optimal_train['model'] = ['stacked_baselearner_svm_optimal'] * 5
stacked_baselearner_svm_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Base Learner SVM Train Performance Metrics: ')
display(stacked_baselearner_svm_optimal_train)
Optimal Stacked Base Learner SVM Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.901961 stacked_baselearner_svm_optimal train
1 Precision 0.805970 stacked_baselearner_svm_optimal train
2 Recall 0.885246 stacked_baselearner_svm_optimal train
3 F1 0.843750 stacked_baselearner_svm_optimal train
4 AUROC 0.897168 stacked_baselearner_svm_optimal train
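
The model_performance_evaluation helper is defined earlier in the notebook; based on the long-format table above, a minimal sketch of such a helper (column names inferred from the display, not the original definition) might look like:

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def model_performance_evaluation(y_true, y_pred):
    # Assemble the five classification metrics in long format
    metrics = {'Accuracy': accuracy_score(y_true, y_pred),
               'Precision': precision_score(y_true, y_pred),
               'Recall': recall_score(y_true, y_pred),
               'F1': f1_score(y_true, y_pred),
               'AUROC': roc_auc_score(y_true, y_pred)}
    return pd.DataFrame({'metric_name': list(metrics.keys()),
                         'metric_value': list(metrics.values())})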
In [325]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_baselearner_svm_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_baselearner_svm_optimal.predict(X_preprocessed_validation))
stacked_baselearner_svm_optimal_validation['model'] = ['stacked_baselearner_svm_optimal'] * 5
stacked_baselearner_svm_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Base Learner SVM Validation Performance Metrics: ')
display(stacked_baselearner_svm_optimal_validation)
Optimal Stacked Base Learner SVM Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.913043 stacked_baselearner_svm_optimal validation
1 Precision 0.818182 stacked_baselearner_svm_optimal validation
2 Recall 0.900000 stacked_baselearner_svm_optimal validation
3 F1 0.857143 stacked_baselearner_svm_optimal validation
4 AUROC 0.909184 stacked_baselearner_svm_optimal validation
In [326]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(stacked_baselearner_svm_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_baselearner_svm_optimal.pkl"))
Out[326]:
['..\\models\\stacked_model_baselearner_svm_optimal.pkl']
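
The persisted pipeline can later be restored without retraining; a minimal usage sketch with the same path:

stacked_baselearner_svm_reloaded = joblib.load(
    os.path.join("..", MODELS_PATH, "stacked_model_baselearner_svm_optimal.pkl"))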

1.9.3 Base Learner - Ridge Classifier ¶

Ridge Classifier is a linear classification method that casts the problem as regularized least-squares regression on the encoded class labels, incorporating an L2 penalty that prevents overfitting by shrinking large coefficients in the decision boundary equation. It assumes a linear relationship between the predictor variables and the target class and assigns labels based on the sign of the linear decision function rather than an explicit probability estimate. The key steps include fitting a linear model while adding a penalty term to shrink coefficient values, which reduces variance and improves generalization. Ridge Classifier is particularly useful when dealing with collinear features, as it distributes the importance among correlated variables instead of assigning extreme weights to a few. The advantages of Ridge Classifier include its efficiency, interpretability, and ability to handle high-dimensional data with multicollinearity. However, it has limitations: it assumes a linear decision boundary, making it unsuitable for complex, non-linear relationships, and the regularization parameter requires tuning to balance bias and variance effectively. Additionally, it does not perform feature selection, meaning all input features contribute to the decision-making process, which may reduce interpretability in some cases.
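
To make the contrast with logistic regression concrete, the following is a minimal sketch on synthetic data (not the project pipeline) showing that RidgeClassifier scores observations with a linear decision function and assigns the class by its sign, with no probability estimate involved:

import numpy as np
from sklearn.linear_model import RidgeClassifier

# Synthetic two-class data (illustration only)
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)

# Targets are internally mapped to {-1, +1} and fit by penalized least squares
clf = RidgeClassifier(alpha=1.0).fit(X, y)

# Predictions are simply the sign of the linear score
scores = clf.decision_function(X)
assert ((scores > 0).astype(int) == clf.predict(X)).all()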

  1. The ridge classifier model from the sklearn.linear_model Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • alpha = regularization strength made to vary between 1.0 and 2.0
    • solver = solver to use in the computational routines made to vary between sag and saga
    • tol = precision of the solution made to vary between 1e-3 and 1e-4
  3. A special hyperparameter (class_weight = balanced) was fixed to address the mild 2:1 class imbalance observed between the No and Yes Recurred categories (see the weighting sketch after this list).
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with the best F1-score performance obtained for:
    • alpha = 2.0
    • solver = saga
    • tol = 1e-4
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8872
    • Precision = 0.7638
    • Recall = 0.9016
    • F1 Score = 0.8270
    • AUROC = 0.8913
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.7826
    • Recall = 0.9000
    • F1 Score = 0.8372
    • AUROC = 0.8989
  7. The apparent and independent validation model performance measures were sufficiently comparable, suggesting the absence of excessive model overfitting.
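
As referenced in item 3 above, class_weight = balanced reweights observations inversely to class frequency using scikit-learn's rule n_samples / (n_classes * class_count). A minimal sketch using the train-split counts reported in the classification reports (143 No, 61 Yes):

import numpy as np

# Train split class counts: 143 'No' and 61 'Yes' recurrence cases
class_counts = np.array([143, 61])
n_samples, n_classes = class_counts.sum(), len(class_counts)

# 'balanced' weight per class: n_samples / (n_classes * count)
balanced_weights = n_samples / (n_classes * class_counts)
print(balanced_weights)  # approximately [0.71, 1.67]; the minority class is upweighted
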
In [327]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [328]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
stacked_baselearner_rc_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('stacked_baselearner_rc_model', RidgeClassifier(class_weight='balanced',
                                                     random_state=987654321))
])
In [329]:
##################################
# Defining hyperparameter grid
##################################
stacked_baselearner_rc_hyperparameter_grid = {
    'stacked_baselearner_rc_model__alpha': [1.00, 2.00],
    'stacked_baselearner_rc_model__solver': ['sag', 'saga'],
    'stacked_baselearner_rc_model__tol': [1e-3, 1e-4]
}
In [330]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [331]:
##################################
# Performing Grid Search with cross-validation
##################################
stacked_baselearner_rc_grid_search = GridSearchCV(
    estimator=stacked_baselearner_rc_pipeline,
    param_grid=stacked_baselearner_rc_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [332]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [333]:
##################################
# Fitting GridSearchCV
##################################
stacked_baselearner_rc_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[333]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('stacked_baselearner_rc_model',
                                        RidgeClassifier(class_weight='balanced',
                                                        random_state=987654321))]),
             n_jobs=-1,
             param_grid={'stacked_baselearner_rc_model__alpha': [1.0, 2.0],
                         'stacked_baselearner_rc_model__solver': ['sag',
                                                                  'saga'],
                         'stacked_baselearner_rc_model__tol': [0.001, 0.0001]},
             scoring='f1', verbose=1)
In [334]:
##################################
# Identifying the best model
##################################
stacked_baselearner_rc_optimal = stacked_baselearner_rc_grid_search.best_estimator_
In [335]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
stacked_baselearner_rc_optimal_f1_cv = stacked_baselearner_rc_grid_search.best_score_
stacked_baselearner_rc_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_train))
stacked_baselearner_rc_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_validation))
In [336]:
##################################
# Displaying the optimal model hyperparameters
##################################
print('Best Stacked Base Learner Ridge Classifier: ')
print(f"Best Stacked Base Learner Ridge Classifier Hyperparameters: {stacked_baselearner_rc_grid_search.best_params_}")
Best Stacked Base Learner Ridge Classifier: 
Best Stacked Base Learner Ridge Classifier Hyperparameters: {'stacked_baselearner_rc_model__alpha': 2.0, 'stacked_baselearner_rc_model__solver': 'saga', 'stacked_baselearner_rc_model__tol': 0.0001}
In [337]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {stacked_baselearner_rc_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {stacked_baselearner_rc_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8097
F1 Score on Training Data: 0.8271

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.88      0.92       143
         1.0       0.76      0.90      0.83        61

    accuracy                           0.89       204
   macro avg       0.86      0.89      0.87       204
weighted avg       0.90      0.89      0.89       204

In [338]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Ridge Classifier Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Ridge Classifier Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner ridge classifier on the train data]
In [339]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {stacked_baselearner_rc_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8372

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        49
         1.0       0.78      0.90      0.84        20

    accuracy                           0.90        69
   macro avg       0.87      0.90      0.88        69
weighted avg       0.91      0.90      0.90        69

In [340]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Ridge Classifier Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Ridge Classifier Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner ridge classifier on the validation data]
In [341]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_baselearner_rc_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_train))
stacked_baselearner_rc_optimal_train['model'] = ['stacked_baselearner_rc_optimal'] * 5
stacked_baselearner_rc_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Base Learner Ridge Classifier Train Performance Metrics: ')
display(stacked_baselearner_rc_optimal_train)
Optimal Stacked Base Learner Ridge Classifier Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.887255 stacked_baselearner_rc_optimal train
1 Precision 0.763889 stacked_baselearner_rc_optimal train
2 Recall 0.901639 stacked_baselearner_rc_optimal train
3 F1 0.827068 stacked_baselearner_rc_optimal train
4 AUROC 0.891379 stacked_baselearner_rc_optimal train
In [342]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_baselearner_rc_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_baselearner_rc_optimal.predict(X_preprocessed_validation))
stacked_baselearner_rc_optimal_validation['model'] = ['stacked_baselearner_rc_optimal'] * 5
stacked_baselearner_rc_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Base Learner Ridge Classifier Validation Performance Metrics: ')
display(stacked_baselearner_rc_optimal_validation)
Optimal Stacked Base Learner Ridge Classifier Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.898551 stacked_baselearner_rc_optimal validation
1 Precision 0.782609 stacked_baselearner_rc_optimal validation
2 Recall 0.900000 stacked_baselearner_rc_optimal validation
3 F1 0.837209 stacked_baselearner_rc_optimal validation
4 AUROC 0.898980 stacked_baselearner_rc_optimal validation
In [343]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(stacked_baselearner_rc_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_baselearner_ridge_classifier_optimal.pkl"))
Out[343]:
['..\\models\\stacked_model_baselearner_ridge_classifier_optimal.pkl']

1.9.4 Base Learner - Neural Network ¶

Neural Network is a classification algorithm inspired by the human brain, consisting of layers of interconnected neurons that transform input features through weighted connections and activation functions. It learns patterns in data through backpropagation, where the network adjusts its internal weights to minimize classification error. The process involves an input layer receiving data, multiple hidden layers extracting hierarchical features, and an output layer producing a final prediction. The key advantages of neural networks include their ability to model highly complex, non-linear relationships, making them suitable for image, text, and speech classification tasks. They are also highly scalable, capable of handling massive datasets. However, neural networks have several challenges: they require substantial computational resources, especially for deep architectures; they need large amounts of labeled data for effective training; and they are often difficult to interpret due to their "black box" nature. Additionally, hyperparameter tuning, including choosing the number of layers, neurons, and activation functions, is non-trivial and requires careful optimization to prevent overfitting or underfitting.
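
To ground the mechanics, the following is a minimal sketch on synthetic data (not the project pipeline) of a single-hidden-layer MLPClassifier mirroring the tuned configuration below (50 relu units, lbfgs solver):

import numpy as np
from sklearn.neural_network import MLPClassifier

# Synthetic two-class data with a non-linear decision boundary (illustration only)
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] * X[:, 1] > 0).astype(int)

# Input layer -> one hidden layer of 50 relu units -> output layer
mlp = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', alpha=0.0001,
                    solver='lbfgs', max_iter=500, random_state=0).fit(X, y)
print(mlp.score(X, y))  # training accuracy on the synthetic task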

  1. The neural network model from the sklearn.neural_network Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • hidden_layer_sizes = number of neurons per hidden layer (the ith element sets the width of the ith layer), made to vary between (50,) and (100,)
    • activation = activation function for the hidden layer made to vary between relu and tanh
    • alpha = strength of the L2 regularization term made to vary between 0.0001 and 0.001
  3. No hyperparameter was defined in the model to address the mild 2:1 class imbalance observed between the No and Yes Recurred categories, as MLPClassifier does not expose a class_weight option.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with the best F1-score performance obtained for:
    • hidden_layer_sizes = (50,)
    • activation = relu
    • alpha = 0.0001
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8921
    • Precision = 0.8095
    • Recall = 0.8360
    • F1 Score = 0.8225
    • AUROC = 0.8760
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8840
    • Precision = 0.7727
    • Recall = 0.8500
    • F1 Score = 0.8095
    • AUROC = 0.8739
  7. The apparent and independent validation model performance measures were sufficiently comparable, suggesting the absence of excessive model overfitting.
In [344]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [345]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
stacked_baselearner_nn_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('stacked_baselearner_nn_model', MLPClassifier(max_iter=500,
                                                   solver='lbfgs',
                                                   early_stopping=False,
                                                   random_state=987654321))
])
In [346]:
##################################
# Defining hyperparameter grid
##################################
stacked_baselearner_nn_hyperparameter_grid = {
    'stacked_baselearner_nn_model__hidden_layer_sizes': [(50,), (100,)],
    'stacked_baselearner_nn_model__activation': ['relu', 'tanh'],
    'stacked_baselearner_nn_model__alpha': [0.0001, 0.001]
}
In [347]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [348]:
##################################
# Performing Grid Search with cross-validation
##################################
stacked_baselearner_nn_grid_search = GridSearchCV(
    estimator=stacked_baselearner_nn_pipeline,
    param_grid=stacked_baselearner_nn_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [349]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [350]:
##################################
# Fitting GridSearchCV
##################################
stacked_baselearner_nn_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[350]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('stacked_baselearner_nn_model',
                                        MLPClassifier(max_iter=500,
                                                      random_state=987654321,
                                                      solver='lbfgs'))]),
             n_jobs=-1,
             param_grid={'stacked_baselearner_nn_model__activation': ['relu',
                                                                      'tanh'],
                         'stacked_baselearner_nn_model__alpha': [0.0001, 0.001],
                         'stacked_baselearner_nn_model__hidden_layer_sizes': [(50,),
                                                                              (100,)]},
             scoring='f1', verbose=1)
In [351]:
##################################
# Identifying the best model
##################################
stacked_baselearner_nn_optimal = stacked_baselearner_nn_grid_search.best_estimator_
In [352]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
stacked_baselearner_nn_optimal_f1_cv = stacked_baselearner_nn_grid_search.best_score_
stacked_baselearner_nn_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_train))
stacked_baselearner_nn_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_validation))
In [353]:
##################################
# Displaying the optimal model hyperparameters
##################################
print('Best Stacked Base Learner Neural Network: ')
print(f"Best Stacked Base Learner Neural Network Hyperparameters: {stacked_baselearner_nn_grid_search.best_params_}")
Best Stacked Base Learner Neural Network: 
Best Stacked Base Learner Neural Network Hyperparameters: {'stacked_baselearner_nn_model__activation': 'relu', 'stacked_baselearner_nn_model__alpha': 0.0001, 'stacked_baselearner_nn_model__hidden_layer_sizes': (50,)}
In [354]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {stacked_baselearner_nn_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {stacked_baselearner_nn_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8063
F1 Score on Training Data: 0.8226

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.93      0.92      0.92       143
         1.0       0.81      0.84      0.82        61

    accuracy                           0.89       204
   macro avg       0.87      0.88      0.87       204
weighted avg       0.89      0.89      0.89       204

In [355]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Neural Network Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Neural Network Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner neural network on the train data]
In [356]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {stacked_baselearner_nn_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8095

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.94      0.90      0.92        49
         1.0       0.77      0.85      0.81        20

    accuracy                           0.88        69
   macro avg       0.85      0.87      0.86        69
weighted avg       0.89      0.88      0.89        69

In [357]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Neural Network Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Neural Network Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: raw and normalized confusion matrices for the optimal stacked base learner neural network on the validation data]
In [358]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_baselearner_nn_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_train))
stacked_baselearner_nn_optimal_train['model'] = ['stacked_baselearner_nn_optimal'] * 5
stacked_baselearner_nn_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Base Learner Neural Network Train Performance Metrics: ')
display(stacked_baselearner_nn_optimal_train)
Optimal Stacked Base Learner Neural Network Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.892157 stacked_baselearner_nn_optimal train
1 Precision 0.809524 stacked_baselearner_nn_optimal train
2 Recall 0.836066 stacked_baselearner_nn_optimal train
3 F1 0.822581 stacked_baselearner_nn_optimal train
4 AUROC 0.876075 stacked_baselearner_nn_optimal train
In [359]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_baselearner_nn_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_baselearner_nn_optimal.predict(X_preprocessed_validation))
stacked_baselearner_nn_optimal_validation['model'] = ['stacked_baselearner_nn_optimal'] * 5
stacked_baselearner_nn_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Base Learner Neural Network Validation Performance Metrics: ')
display(stacked_baselearner_nn_optimal_validation)
Optimal Stacked Base Learner Neural Network Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.884058 stacked_baselearner_nn_optimal validation
1 Precision 0.772727 stacked_baselearner_nn_optimal validation
2 Recall 0.850000 stacked_baselearner_nn_optimal validation
3 F1 0.809524 stacked_baselearner_nn_optimal validation
4 AUROC 0.873980 stacked_baselearner_nn_optimal validation
In [360]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(stacked_baselearner_nn_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_baselearner_neural_network_optimal.pkl"))
Out[360]:
['..\\models\\stacked_model_baselearner_neural_network_optimal.pkl']

1.9.5 Base Learner - Decision Tree ¶

Decision Tree is a hierarchical classification model that recursively splits data based on feature values, forming a tree-like structure where each node represents a decision rule and each leaf represents a class label. The tree is built using a greedy algorithm that selects the best feature at each step based on criteria such as information gain or Gini impurity. The main advantages of decision trees include their interpretability, as the decision-making process can be easily visualized and understood, and their ability to model non-linear relationships without requiring extensive feature engineering. They also handle both numerical and categorical data well. However, decision trees are prone to overfitting, especially when deep trees are grown without pruning. Small changes in the dataset can lead to entirely different structures, making them unstable. Additionally, they tend to perform poorly on highly complex problems where relationships between variables are intricate, making ensemble methods such as Random Forest or Gradient Boosting more effective in practice.
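
To illustrate the split criterion, a minimal sketch computing Gini impurity and the weighted impurity decrease for a candidate split (all counts are illustrative, not taken from the dataset):

import numpy as np

def gini(counts):
    # Gini impurity: 1 - sum of squared class proportions
    p = np.asarray(counts) / np.sum(counts)
    return 1.0 - np.sum(p ** 2)

# Hypothetical parent node (100 'No', 50 'Yes') split into two children
parent, left, right = [100, 50], [90, 10], [10, 40]
n = sum(parent)
weighted_children = (sum(left) / n) * gini(left) + (sum(right) / n) * gini(right)
print(gini(parent) - weighted_children)  # impurity decrease; larger means a better split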

  1. The decision tree model from the sklearn.tree Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
  3. A special hyperparameter (class_weight = balanced) was fixed to address the mild 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with the best F1-score performance obtained for:
    • criterion = gini
    • max_depth = 6
    • min_samples_leaf = 5
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8970
    • Precision = 0.7500
    • Recall = 0.9836
    • F1 Score = 0.8510
    • AUROC = 0.9218
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8550
    • Precision = 0.6666
    • Recall = 1.0000
    • F1 Score = 0.8000
    • AUROC = 0.8979
  7. The apparent and independent validation model performance measures were sufficiently comparable, suggesting the absence of excessive model overfitting.
In [361]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [362]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
stacked_baselearner_dt_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('stacked_baselearner_dt_model', DecisionTreeClassifier(class_weight='balanced',
                                                            random_state=987654321))
])
In [363]:
##################################
# Defining hyperparameter grid
##################################
stacked_baselearner_dt_hyperparameter_grid = {
    'stacked_baselearner_dt_model__criterion': ['gini', 'entropy'],
    'stacked_baselearner_dt_model__max_depth': [3, 6],
    'stacked_baselearner_dt_model__min_samples_leaf': [5, 10]
}
In [364]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [365]:
##################################
# Performing Grid Search with cross-validation
##################################
stacked_baselearner_dt_grid_search = GridSearchCV(
    estimator=stacked_baselearner_dt_pipeline,
    param_grid=stacked_baselearner_dt_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [366]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [367]:
##################################
# Fitting GridSearchCV
##################################
stacked_baselearner_dt_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[367]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('stacked_baselearner_dt_model',
                                        DecisionTreeClassifier(class_weight='balanced',
                                                               random_state=987654321))]),
             n_jobs=-1,
             param_grid={'stacked_baselearner_dt_model__criterion': ['gini',
                                                                     'entropy'],
                         'stacked_baselearner_dt_model__max_depth': [3, 6],
                         'stacked_baselearner_dt_model__min_samples_leaf': [5,
                                                                            10]},
             scoring='f1', verbose=1)
In [368]:
##################################
# Identifying the best model
##################################
stacked_baselearner_dt_optimal = stacked_baselearner_dt_grid_search.best_estimator_
In [369]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
stacked_baselearner_dt_optimal_f1_cv = stacked_baselearner_dt_grid_search.best_score_
stacked_baselearner_dt_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_train))
stacked_baselearner_dt_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_validation))
In [370]:
##################################
# Displaying the optimal model hyperparameters
##################################
print('Best Stacked Base Learner Decision Tree: ')
print(f"Best Stacked Base Learner Decision Tree Hyperparameters: {stacked_baselearner_dt_grid_search.best_params_}")
Best Stacked Base Learner Decision Tree: 
Best Stacked Base Learner Decision Tree Hyperparameters: {'stacked_baselearner_dt_model__criterion': 'gini', 'stacked_baselearner_dt_model__max_depth': 6, 'stacked_baselearner_dt_model__min_samples_leaf': 5}
In [371]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {stacked_baselearner_dt_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {stacked_baselearner_dt_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8099
F1 Score on Training Data: 0.8511

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.99      0.86      0.92       143
         1.0       0.75      0.98      0.85        61

    accuracy                           0.90       204
   macro avg       0.87      0.92      0.89       204
weighted avg       0.92      0.90      0.90       204

In [372]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Decision Tree Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Decision Tree Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal stacked base learner decision tree on the train data (Predicted vs Actual)]
In [373]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {stacked_baselearner_dt_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8000

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       1.00      0.80      0.89        49
         1.0       0.67      1.00      0.80        20

    accuracy                           0.86        69
   macro avg       0.83      0.90      0.84        69
weighted avg       0.90      0.86      0.86        69

In [374]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Base Learner Decision Tree Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Base Learner Decision Tree Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal stacked base learner decision tree on the validation data (Predicted vs Actual)]
In [375]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_baselearner_dt_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_train))
stacked_baselearner_dt_optimal_train['model'] = ['stacked_baselearner_dt_optimal'] * 5
stacked_baselearner_dt_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Base Learner Decision Tree Train Performance Metrics: ')
display(stacked_baselearner_dt_optimal_train)
Optimal Stacked Base Learner Decision Tree Train Performance Metrics: 
   metric_name  metric_value                           model    set
0     Accuracy      0.897059  stacked_baselearner_dt_optimal  train
1    Precision      0.750000  stacked_baselearner_dt_optimal  train
2       Recall      0.983607  stacked_baselearner_dt_optimal  train
3           F1      0.851064  stacked_baselearner_dt_optimal  train
4        AUROC      0.921873  stacked_baselearner_dt_optimal  train
In [376]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_baselearner_dt_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_baselearner_dt_optimal.predict(X_preprocessed_validation))
stacked_baselearner_dt_optimal_validation['model'] = ['stacked_baselearner_dt_optimal'] * 5
stacked_baselearner_dt_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Base Learner Decision Tree Validation Performance Metrics: ')
display(stacked_baselearner_dt_optimal_validation)
Optimal Stacked Base Learner Decision Tree Validation Performance Metrics: 
   metric_name  metric_value                           model         set
0     Accuracy      0.855072  stacked_baselearner_dt_optimal  validation
1    Precision      0.666667  stacked_baselearner_dt_optimal  validation
2       Recall      1.000000  stacked_baselearner_dt_optimal  validation
3           F1      0.800000  stacked_baselearner_dt_optimal  validation
4        AUROC      0.897959  stacked_baselearner_dt_optimal  validation
In [377]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(stacked_baselearner_dt_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_baselearner_decision_trees_optimal.pkl"))
Out[377]:
['..\\models\\stacked_model_baselearner_decision_trees_optimal.pkl']

1.9.6 Meta Learner - Logistic Regression ¶

Logistic Regression is a linear classification algorithm that estimates the probability of a binary outcome using the logistic (sigmoid) function. It assumes a linear relationship between the predictor variables and the log-odds of the target class. The algorithm involves calculating a weighted sum of input features, applying the sigmoid function to transform the result into a probability, and assigning a class label based on a threshold (typically 0.5). Logistic regression is simple, interpretable, and computationally efficient, making it a popular choice for baseline models and problems where relationships between features and the target variable are approximately linear. It also provides insight into feature importance through its learned coefficients. However, logistic regression has limitations: it struggles with non-linear relationships unless feature engineering or polynomial terms are used, and it is sensitive to multicollinearity among the predictor variables, which may not always be avoidable in real-world data. Additionally, it may perform poorly when classes are highly imbalanced, requiring techniques such as weighting or resampling to improve predictions.
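As a concrete trace of these steps, the minimal sketch below (using hypothetical coefficient and feature values, not quantities estimated in this notebook) converts a weighted feature sum into a log-odds value, maps it to a probability through the sigmoid function, and assigns a class label at the 0.5 threshold.

##################################
# Minimal sketch of a single logistic regression prediction
# (hypothetical coefficients and feature values)
##################################
import numpy as np

def sigmoid(z):
    # Map the log-odds to a probability in (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

# Hypothetical learned coefficients and intercept for two features
coefficients = np.array([1.2, -0.8])
intercept = -0.3

# Hypothetical feature values for a single observation
x = np.array([0.5, 1.0])

# Weighted sum of inputs = log-odds of the positive class
log_odds = np.dot(coefficients, x) + intercept

# Transform into a probability and assign a label at the 0.5 threshold
probability = sigmoid(log_odds)
predicted_class = int(probability >= 0.5)
print(f"Probability: {probability:.4f}, Predicted class: {predicted_class}")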

  1. The logistic regression model from the sklearn.linear_model Python library API was implemented.
  2. The model contains 3 fixed hyperparameters:
    • C = inverse of regularization strength held constant at a value of 1.0
    • penalty = penalty norm held constant at a value of l2
    • solver = algorithm used in the optimization problem held constant at a value of lbfgs
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9068
    • Precision = 0.8088
    • Recall = 0.9016
    • F1 Score = 0.8527
    • AUROC = 0.9053
  5. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  6. Sufficiently comparable apparent and independent validation model performance observed that might be indicative of the absence of excessive model overfitting.
In [378]:
##################################
# Defining the stacking strategy (5-fold CV)
##################################
stacking_strategy = KFold(n_splits=5,
                          shuffle=True,
                          random_state=987654321)
In [379]:
##################################
# Loading the pre-trained base learners
# from the previously saved pickle files
##################################
stacked_baselearners = {}
stacked_baselearner_model = ['knn', 'svm', 'ridge_classifier', 'neural_network', 'decision_trees']
for name in stacked_baselearner_model:
    stacked_baselearner_model_path = (os.path.join("..", MODELS_PATH, f"stacked_model_baselearner_{name}_optimal.pkl"))
    stacked_baselearners[name] = joblib.load(stacked_baselearner_model_path)
        
In [380]:
##################################
# Initializing the meta-feature matrices
##################################
meta_train_stacked = np.zeros((X_preprocessed_train.shape[0], len(stacked_baselearners)))
meta_validation_stacked = np.zeros((X_preprocessed_validation.shape[0], len(stacked_baselearners)))
In [381]:
##################################
# Generating out-of-fold predictions for training the meta learner
##################################
for i, (name, model) in enumerate(stacked_baselearners.items()):
    oof_preds = np.zeros(X_preprocessed_train.shape[0])
    validation_fold_preds = np.zeros((X_preprocessed_validation.shape[0], stacking_strategy.get_n_splits()))

    for j, (train_idx, val_idx) in enumerate(stacking_strategy.split(X_preprocessed_train)):
        model.fit(X_preprocessed_train.iloc[train_idx], y_preprocessed_train_encoded[train_idx])
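        # Use class-1 probabilities as meta-features when predict_proba is available;
        # fall back to hard label predictions otherwise (e.g., the ridge classifier
        # base learner exposes no predict_proba method)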
        oof_preds[val_idx] = model.predict_proba(X_preprocessed_train.iloc[val_idx])[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_train.iloc[val_idx])
        validation_fold_preds[:, j] = model.predict_proba(X_preprocessed_validation)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_validation)
        
    # Extracting the meta-feature matrix for the train data
    meta_train_stacked[:, i] = oof_preds
    # Extracting the meta-feature matrix for the validation data
    # Averaging the validation predictions across folds
    meta_validation_stacked[:, i] = validation_fold_preds.mean(axis=1)  
In [382]:
##################################
# Training the meta learner on the stacked features
##################################
stacked_metalearner_lr_optimal = LogisticRegression(class_weight='balanced',
                                                    penalty='l2',
                                                    C=1.0,
                                                    solver='lbfgs',
                                                    random_state=987654321)
stacked_metalearner_lr_optimal.fit(meta_train_stacked, y_preprocessed_train_encoded)
Out[382]:
LogisticRegression(class_weight='balanced', random_state=987654321)
In [383]:
##################################
# Saving the meta learner model
# developed from the meta-train data
################################## 
joblib.dump(stacked_metalearner_lr_optimal, 
            os.path.join("..", MODELS_PATH, "stacked_model_metalearner_logistic_regression_optimal.pkl"))
Out[383]:
['..\\models\\stacked_model_metalearner_logistic_regression_optimal.pkl']
In [384]:
##################################
# Creating a function to extract the 
# meta-feature matrices for new data
################################## 
def extract_stacked_metafeature_matrix(X_preprocessed_new):
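    # Note: this helper relies on X_preprocessed_train, y_preprocessed_train_encoded,
    # stacking_strategy, and MODELS_PATH from the enclosing notebook scope,
    # and refits every base learner across all folds on each call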
    ##################################
    # Loading the pre-trained base learners
    # from the previously saved pickle files
    ##################################
    stacked_baselearners = {}
    stacked_baselearner_model = ['knn', 'svm', 'ridge_classifier', 'neural_network', 'decision_trees']
    for name in stacked_baselearner_model:
        stacked_baselearner_model_path = (os.path.join("..", MODELS_PATH, f"stacked_model_baselearner_{name}_optimal.pkl"))
        stacked_baselearners[name] = joblib.load(stacked_baselearner_model_path)

    ##################################
    # Generating meta-features for new data
    ##################################
    meta_train_stacked = np.zeros((X_preprocessed_train.shape[0], len(stacked_baselearners)))
    meta_new_stacked = np.zeros((X_preprocessed_new.shape[0], len(stacked_baselearners)))

    ##################################
    # Generating out-of-fold predictions for training the meta learner
    ##################################
    for i, (name, model) in enumerate(stacked_baselearners.items()):
        oof_preds = np.zeros(X_preprocessed_train.shape[0])
        new_fold_preds = np.zeros((X_preprocessed_new.shape[0], stacking_strategy.get_n_splits()))

        for j, (train_idx, val_idx) in enumerate(stacking_strategy.split(X_preprocessed_train)):
            model.fit(X_preprocessed_train.iloc[train_idx], y_preprocessed_train_encoded[train_idx])
            oof_preds[val_idx] = model.predict_proba(X_preprocessed_train.iloc[val_idx])[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_train.iloc[val_idx])
            new_fold_preds[:, j] = model.predict_proba(X_preprocessed_new)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_new)
        
        # Extracting the meta-feature matrix for the train data
        meta_train_stacked[:, i] = oof_preds
        # Extracting the meta-feature matrix for the new data
        # Averaging the new predictions across folds
        meta_new_stacked[:, i] = new_fold_preds.mean(axis=1)

    return meta_new_stacked
    
In [385]:
##################################
# Evaluating the F1 scores
# on the training and validation data
##################################
stacked_metalearner_lr_optimal_f1_train = f1_score(y_preprocessed_train_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_train)))
stacked_metalearner_lr_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_validation)))
In [386]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training data
# to assess overfitting optimism
##################################
print(f"F1 Score on Training Data: {stacked_metalearner_lr_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_train))))
F1 Score on Training Data: 0.8527

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.91      0.93       143
         1.0       0.81      0.90      0.85        61

    accuracy                           0.91       204
   macro avg       0.88      0.91      0.89       204
weighted avg       0.91      0.91      0.91       204

In [387]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_train)))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_train)), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Meta Learner Logistic Regression Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Meta Learner Logistic Regression Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal stacked meta learner logistic regression on the train data (Predicted vs Actual)]
In [388]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validationing Data: {stacked_metalearner_lr_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_validation))))
F1 Score on Validationing Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [389]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_validation)))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_validation)), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Stacked Meta Learner Logistic Regression Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Stacked Meta Learner Logistic Regression Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal stacked meta learner logistic regression on the validation data (Predicted vs Actual)]
In [390]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
stacked_metalearner_lr_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_train)))
stacked_metalearner_lr_optimal_train['model'] = ['stacked_metalearner_lr_optimal'] * 5
stacked_metalearner_lr_optimal_train['set'] = ['train'] * 5
print('Optimal Stacked Meta Learner Logistic Regression Train Performance Metrics: ')
display(stacked_metalearner_lr_optimal_train)
Optimal Stacked Meta Learner Logistic Regression Train Performance Metrics: 
   metric_name  metric_value                           model    set
0     Accuracy      0.906863  stacked_metalearner_lr_optimal  train
1    Precision      0.808824  stacked_metalearner_lr_optimal  train
2       Recall      0.901639  stacked_metalearner_lr_optimal  train
3           F1      0.852713  stacked_metalearner_lr_optimal  train
4        AUROC      0.905365  stacked_metalearner_lr_optimal  train
In [391]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
stacked_metalearner_lr_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, stacked_metalearner_lr_optimal.predict(extract_stacked_metafeature_matrix(X_preprocessed_validation)))
stacked_metalearner_lr_optimal_validation['model'] = ['stacked_metalearner_lr_optimal'] * 5
stacked_metalearner_lr_optimal_validation['set'] = ['validation'] * 5
print('Optimal Stacked Meta Learner Logistic Regression Validation Performance Metrics: ')
display(stacked_metalearner_lr_optimal_validation)
Optimal Stacked Meta Learner Logistic Regression Validation Performance Metrics: 
   metric_name  metric_value                           model         set
0     Accuracy      0.913043  stacked_metalearner_lr_optimal  validation
1    Precision      0.818182  stacked_metalearner_lr_optimal  validation
2       Recall      0.900000  stacked_metalearner_lr_optimal  validation
3           F1      0.857143  stacked_metalearner_lr_optimal  validation
4        AUROC      0.909184  stacked_metalearner_lr_optimal  validation

1.10 Blended Model Development ¶

Blending is an ensemble technique that enhances classification accuracy by training a meta-model on a holdout validation set, rather than using out-of-fold predictions like stacking. This simplifies implementation while maintaining the benefits of combining multiple base models. The process of blending starts by training base models on the full training dataset. Instead of applying cross-validation to obtain out-of-fold predictions, blending reserves a small portion of the training data as a holdout set. The base models make predictions on this unseen holdout set, and these predictions are then used as input features for a meta-model, which learns how to optimally combine them into a final classification decision. Since the meta-model is trained on predictions from unseen data, it avoids the risk of overfitting that can sometimes occur when base models are evaluated on the same data they were trained on. Blending is motivated by its simplicity and ease of implementation compared to stacking, as it eliminates the need for repeated k-fold cross-validation to generate training data for the meta-model. However, one drawback is that the meta-model has access to fewer training examples, as a portion of the data is withheld for validation rather than being used for training. This can limit the generalization ability of the final model, especially if the holdout set is too small. Despite this limitation, blending remains a useful approach in applications where a quick and effective ensemble method is needed without the computational overhead of stacking.
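As an illustration of these mechanics, the minimal sketch below (on synthetic data with hypothetical names, not the fitted pipelines developed in this notebook) walks through the blending recipe: base learners are fit on the training partition, their predictions on a reserved holdout partition become meta-features, and a logistic regression meta-model is trained on those holdout predictions.

##################################
# Minimal blending sketch on synthetic data
# (hypothetical names, not the fitted pipelines in this notebook)
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=987654321)

# Reserve a holdout partition whose predictions will train the meta-model
X_train_demo, X_holdout_demo, y_train_demo, y_holdout_demo = train_test_split(
    X_demo, y_demo, test_size=0.3, random_state=987654321)

# Fit the base learners on the training partition only
base_models = [KNeighborsClassifier(n_neighbors=3),
               DecisionTreeClassifier(max_depth=6, random_state=987654321)]
for base_model in base_models:
    base_model.fit(X_train_demo, y_train_demo)

# Base-learner probabilities on the unseen holdout become the meta-features
meta_features_demo = np.column_stack(
    [base_model.predict_proba(X_holdout_demo)[:, 1] for base_model in base_models])

# The meta-model learns how to combine the holdout predictions
meta_model_demo = LogisticRegression(random_state=987654321)
meta_model_demo.fit(meta_features_demo, y_holdout_demo)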

1.10.1 Base Learner - K-Nearest Neighbors ¶

K-Nearest Neighbors (KNN) is a non-parametric classification algorithm that makes predictions based on the majority class among the k-nearest training samples in feature space. It does not create an explicit model during training; instead, it stores the entire dataset and computes distances between a query point and all training samples during inference. The algorithm follows three key steps: (1) compute the distance between the query point and all training samples (typically using Euclidean distance), (2) identify the k closest points, and (3) assign the most common class among them as the predicted label. KNN is advantageous because it is simple, requires minimal training time, and can model complex decision boundaries when provided with sufficient data. However, it has significant drawbacks: it is computationally expensive for large datasets since distances must be computed for every prediction, it is sensitive to irrelevant or redundant features, and it requires careful selection of k, as a small k can make the model too sensitive to noise while a large k can overly smooth decision boundaries.
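The three steps can be traced directly in code; the minimal sketch below (on toy values, purely illustrative) computes the Euclidean distances, selects the k closest training samples, and takes a majority vote among their labels.

##################################
# Minimal sketch of the three KNN steps
# (toy data, illustrative only)
##################################
import numpy as np
from collections import Counter

# Toy training samples with binary labels and a single query point
X_train_toy = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 3.0], [6.0, 7.0], [7.0, 8.0]])
y_train_toy = np.array([0, 0, 0, 1, 1])
query = np.array([2.5, 3.0])
k = 3

# Step 1: compute the Euclidean distance from the query to every training sample
distances = np.linalg.norm(X_train_toy - query, axis=1)

# Step 2: identify the k closest training samples
nearest_idx = np.argsort(distances)[:k]

# Step 3: assign the most common class among the k neighbors
predicted_class = Counter(y_train_toy[nearest_idx]).most_common(1)[0][0]
print(f"Predicted class: {predicted_class}")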

  1. The k-nearest neighbors model from the sklearn.neighbors Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • n_neighbors = number of neighbors to use made to vary between 3 and 5
    • weights = weight function used in prediction made to vary between uniform and distance
    • metric = metric to use for distance computation made to vary between minkowski and euclidean
  3. No hyperparameter was defined in the model to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories, as the k-nearest neighbors model does not support class weighting.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • n_neighbors = 3
    • weights = uniform
    • metric = minkowski
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9215
    • Precision = 0.9090
    • Recall = 0.8196
    • F1 Score = 0.8620
    • AUROC = 0.8923
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8115
    • Precision = 0.7058
    • Recall = 0.6000
    • F1 Score = 0.6486
    • AUROC = 0.7489
  7. Relatively large difference in apparent and independent validation model performance observed that might be indicative of the presence of moderate model overfitting.
In [392]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender', 'Smoking', 'Physical_Examination', 'Adenopathy', 'Focality', 'Risk', 'T', 'Stage', 'Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
    force_int_remainder_cols=False)
In [393]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
blended_baselearner_knn_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('blended_baselearner_knn_model', KNeighborsClassifier())
])
In [394]:
##################################
# Defining hyperparameter grid
##################################
blended_baselearner_knn_hyperparameter_grid = {
    'blended_baselearner_knn_model__n_neighbors': [3, 5],
    'blended_baselearner_knn_model__weights': ['uniform', 'distance'],
    'blended_baselearner_knn_model__metric': ['minkowski', 'euclidean']
}
In [395]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [396]:
##################################
# Performing Grid Search with cross-validation
##################################
blended_baselearner_knn_grid_search = GridSearchCV(
    estimator=blended_baselearner_knn_pipeline,
    param_grid=blended_baselearner_knn_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [397]:
##################################
# Encoding the response variables
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [398]:
##################################
# Fitting GridSearchCV
##################################
blended_baselearner_knn_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[398]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('blended_baselearner_knn_model',
                                        KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'blended_baselearner_knn_model__metric': ['minkowski',
                                                                   'euclidean'],
                         'blended_baselearner_knn_model__n_neighbors': [3, 5],
                         'blended_baselearner_knn_model__weights': ['uniform',
                                                                    'distance']},
             scoring='f1', verbose=1)
In [399]:
##################################
# Identifying the best model
##################################
blended_baselearner_knn_optimal = blended_baselearner_knn_grid_search.best_estimator_
In [400]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
blended_baselearner_knn_optimal_f1_cv = blended_baselearner_knn_grid_search.best_score_
blended_baselearner_knn_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_train))
blended_baselearner_knn_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_validation))
In [401]:
##################################
# Identifying the optimal model
##################################
print('Best Blended Base Learner KNN: ')
print(f"Best Blended Base Learner KNN Hyperparameters: {blended_baselearner_knn_grid_search.best_params_}")
Best Blended Base Learner KNN: 
Best Blended Base Learner KNN Hyperparameters: {'blended_baselearner_knn_model__metric': 'minkowski', 'blended_baselearner_knn_model__n_neighbors': 3, 'blended_baselearner_knn_model__weights': 'uniform'}
In [402]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {blended_baselearner_knn_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {blended_baselearner_knn_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.6417
F1 Score on Training Data: 0.8621

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       143
         1.0       0.91      0.82      0.86        61

    accuracy                           0.92       204
   macro avg       0.92      0.89      0.90       204
weighted avg       0.92      0.92      0.92       204

In [403]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner KNN Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner KNN Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal blended base learner KNN on the train data (Predicted vs Actual)]
In [404]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {blended_baselearner_knn_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.6486

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.85      0.90      0.87        49
         1.0       0.71      0.60      0.65        20

    accuracy                           0.81        69
   macro avg       0.78      0.75      0.76        69
weighted avg       0.81      0.81      0.81        69

In [405]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner KNN Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner KNN Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal blended base learner KNN on the validation data (Predicted vs Actual)]
In [406]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_baselearner_knn_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_train))
blended_baselearner_knn_optimal_train['model'] = ['blended_baselearner_knn_optimal'] * 5
blended_baselearner_knn_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Base Learner KNN Train Performance Metrics: ')
display(blended_baselearner_knn_optimal_train)
Optimal Blended Base Learner KNN Train Performance Metrics: 
   metric_name  metric_value                            model    set
0     Accuracy      0.921569  blended_baselearner_knn_optimal  train
1    Precision      0.909091  blended_baselearner_knn_optimal  train
2       Recall      0.819672  blended_baselearner_knn_optimal  train
3           F1      0.862069  blended_baselearner_knn_optimal  train
4        AUROC      0.892354  blended_baselearner_knn_optimal  train
In [407]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_baselearner_knn_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_baselearner_knn_optimal.predict(X_preprocessed_validation))
blended_baselearner_knn_optimal_validation['model'] = ['blended_baselearner_knn_optimal'] * 5
blended_baselearner_knn_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Base Learner KNN Validation Performance Metrics: ')
display(blended_baselearner_knn_optimal_validation)
Optimal Blended Base Learner KNN Validation Performance Metrics: 
   metric_name  metric_value                            model         set
0     Accuracy      0.811594  blended_baselearner_knn_optimal  validation
1    Precision      0.705882  blended_baselearner_knn_optimal  validation
2       Recall      0.600000  blended_baselearner_knn_optimal  validation
3           F1      0.648649  blended_baselearner_knn_optimal  validation
4        AUROC      0.748980  blended_baselearner_knn_optimal  validation
In [408]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(blended_baselearner_knn_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_baselearner_knn_optimal.pkl"))
Out[408]:
['..\\models\\blended_model_baselearner_knn_optimal.pkl']

1.10.2 Base Learner - Support Vector Machine ¶

Support Vector Machine (SVM) is a powerful classification algorithm that finds an optimal decision boundary — called a hyperplane — that maximizes the margin between two classes. The algorithm works by identifying the most influential data points, known as support vectors, that define this boundary. If the data is not linearly separable, SVM can use kernel functions to map it into a higher-dimensional space where separation is possible. The main advantages of SVM include strong theoretical guarantees, effectiveness in high-dimensional spaces, and robustness against overfitting when properly regularized. It performs well when the margin between classes is clear and works effectively with small to medium-sized datasets. However, SVM has notable limitations: it is computationally expensive, making it impractical for very large datasets; it requires careful tuning of hyperparameters such as the kernel type and regularization strength; and it is not easily interpretable, as decision boundaries in high-dimensional space can be difficult to visualize.
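As a small illustration (on toy arrays, not the notebook's preprocessed features), the sketch below fits a linear SVC and inspects the support vectors that define the maximum-margin hyperplane.

##################################
# Minimal sketch of a linear SVM and its support vectors
# (toy data, illustrative only)
##################################
import numpy as np
from sklearn.svm import SVC

# Two small linearly separable clusters
X_toy = np.array([[1.0, 1.0], [2.0, 1.5], [1.5, 2.0],
                  [6.0, 6.0], [7.0, 6.5], [6.5, 7.0]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

# A linear kernel searches for the maximum-margin hyperplane;
# C trades off margin width against misclassification of training points
svm_toy = SVC(kernel='linear', C=1.0)
svm_toy.fit(X_toy, y_toy)

# The margin-defining samples are exposed as support vectors
print("Support vectors:\n", svm_toy.support_vectors_)
print("Hyperplane coefficients:", svm_toy.coef_, "intercept:", svm_toy.intercept_)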

  1. The support vector machine model from the sklearn.svm Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • C = inverse of regularization strength made to vary between 0.1 and 1.0
    • kernel = kernel type to be used in the algorithm made to vary between linear and rbf
    • gamma = kernel coefficient made to vary between scale and auto
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • C = 1.0
    • kernel = linear
    • gamma = scale
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9019
    • Precision = 0.8059
    • Recall = 0.8852
    • F1 Score = 0.8437
    • AUROC = 0.8971
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9130
    • Precision = 0.8181
    • Recall = 0.9000
    • F1 Score = 0.8571
    • AUROC = 0.9091
  7. Sufficiently comparable apparent and independent validation model performance observed that might be indicative of the absence of excessive model overfitting.
In [409]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender', 'Smoking', 'Physical_Examination', 'Adenopathy', 'Focality', 'Risk', 'T', 'Stage', 'Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [410]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
blended_baselearner_svm_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('blended_baselearner_svm_model', SVC(class_weight='balanced',
                                          random_state=987654321))
])
In [411]:
##################################
# Defining hyperparameter grid
##################################
blended_baselearner_svm_hyperparameter_grid = {
    'blended_baselearner_svm_model__C': [0.1, 1.0],
    'blended_baselearner_svm_model__kernel': ['linear', 'rbf'],
    'blended_baselearner_svm_model__gamma': ['scale','auto']
}
In [412]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [413]:
##################################
# Performing Grid Search with cross-validation
##################################
blended_baselearner_svm_grid_search = GridSearchCV(
    estimator=blended_baselearner_svm_pipeline,
    param_grid=blended_baselearner_svm_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [414]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [415]:
##################################
# Fitting GridSearchCV
##################################
blended_baselearner_svm_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[415]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('blended_baselearner_svm_model',
                                        SVC(class_weight='balanced',
                                            random_state=987654321))]),
             n_jobs=-1,
             param_grid={'blended_baselearner_svm_model__C': [0.1, 1.0],
                         'blended_baselearner_svm_model__gamma': ['scale',
                                                                  'auto'],
                         'blended_baselearner_svm_model__kernel': ['linear',
                                                                   'rbf']},
             scoring='f1', verbose=1)
In [416]:
##################################
# Identifying the best model
##################################
blended_baselearner_svm_optimal = blended_baselearner_svm_grid_search.best_estimator_
In [417]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
blended_baselearner_svm_optimal_f1_cv = blended_baselearner_svm_grid_search.best_score_
blended_baselearner_svm_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_train))
blended_baselearner_svm_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_validation))
In [418]:
##################################
# Identifying the optimal model
##################################
print('Best Blended Base Learner SVM: ')
print(f"Best Blended Base Learner SVM Hyperparameters: {blended_baselearner_svm_grid_search.best_params_}")
Best Blended Base Learner SVM: 
Best Blended Base Learner SVM Hyperparameters: {'blended_baselearner_svm_model__C': 1.0, 'blended_baselearner_svm_model__gamma': 'scale', 'blended_baselearner_svm_model__kernel': 'linear'}
In [419]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {blended_baselearner_svm_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {blended_baselearner_svm_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8219
F1 Score on Training Data: 0.8438

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       143
         1.0       0.81      0.89      0.84        61

    accuracy                           0.90       204
   macro avg       0.88      0.90      0.89       204
weighted avg       0.91      0.90      0.90       204

In [420]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner SVM Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner SVM Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal blended base learner SVM on the train data (Predicted vs Actual)]
In [421]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {blended_baselearner_svm_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8571

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94        49
         1.0       0.82      0.90      0.86        20

    accuracy                           0.91        69
   macro avg       0.89      0.91      0.90        69
weighted avg       0.92      0.91      0.91        69

In [422]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner SVM Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner SVM Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal blended base learner SVM on the validation data (Predicted vs Actual)]
In [423]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_baselearner_svm_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_train))
blended_baselearner_svm_optimal_train['model'] = ['blended_baselearner_svm_optimal'] * 5
blended_baselearner_svm_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Base Learner SVM Train Performance Metrics: ')
display(blended_baselearner_svm_optimal_train)
Optimal Blended Base Learner SVM Train Performance Metrics: 
  metric_name  metric_value                            model    set
0    Accuracy      0.901961  blended_baselearner_svm_optimal  train
1   Precision      0.805970  blended_baselearner_svm_optimal  train
2      Recall      0.885246  blended_baselearner_svm_optimal  train
3          F1      0.843750  blended_baselearner_svm_optimal  train
4       AUROC      0.897168  blended_baselearner_svm_optimal  train
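Note that model_performance_evaluation is fed the hard class predictions from predict() rather than scores, so the ROC curve has a single operating point and the reported AUROC reduces to the average of sensitivity and specificity. A quick check against the tabulated train metrics:

##################################
# Quick check: with hard 0/1 predictions the AUROC reduces
# to (sensitivity + specificity) / 2, matching the table above
##################################
sensitivity = 0.885246    # positive-class recall (54/61)
specificity = 130 / 143   # negative-class recall (~0.909091)
print((sensitivity + specificity) / 2)  # ~0.897168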
In [424]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_baselearner_svm_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_baselearner_svm_optimal.predict(X_preprocessed_validation))
blended_baselearner_svm_optimal_validation['model'] = ['blended_baselearner_svm_optimal'] * 5
blended_baselearner_svm_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Base Learner SVM Validation Performance Metrics: ')
display(blended_baselearner_svm_optimal_validation)
Optimal Blended Base Learner SVM Validation Performance Metrics: 
  metric_name  metric_value                            model         set
0    Accuracy      0.913043  blended_baselearner_svm_optimal  validation
1   Precision      0.818182  blended_baselearner_svm_optimal  validation
2      Recall      0.900000  blended_baselearner_svm_optimal  validation
3          F1      0.857143  blended_baselearner_svm_optimal  validation
4       AUROC      0.909184  blended_baselearner_svm_optimal  validation
In [425]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(blended_baselearner_svm_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_baselearner_svm_optimal.pkl"))
Out[425]:
['..\\models\\blended_model_baselearner_svm_optimal.pkl']

1.10.3 Base Learner - Ridge Classifier ¶

Ridge Classifier is a linear classification algorithm that applies L2-regularized least-squares (ridge) regression to the target classes encoded as -1 and +1, assigning each observation to a class according to the sign of the fitted linear score. The L2 penalty prevents overfitting by shrinking large coefficients in the decision boundary equation; unlike logistic regression, the model does not estimate class probabilities through the logistic function. The key steps include fitting a linear model while adding a penalty term to shrink coefficient values, which reduces variance and improves generalization. Ridge Classifier is particularly useful when dealing with collinear features, as it distributes the importance among correlated variables instead of assigning extreme weights to a few. The advantages of Ridge Classifier include its efficiency, interpretability, and ability to handle high-dimensional data with multicollinearity. However, it has limitations: it assumes a linear decision boundary, making it unsuitable for complex, non-linear relationships, and the regularization parameter requires tuning to balance bias and variance effectively. Additionally, it does not perform feature selection, meaning all input features contribute to the decision-making process, which may reduce interpretability in some cases.
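To make the decision rule concrete, the minimal sketch below fits a RidgeClassifier on synthetic data and confirms that its predictions follow the sign of decision_function; the toy dataset and names (X_toy, ridge_toy) are illustrative assumptions, not part of the project pipeline.

##################################
# Minimal sketch (toy data, not the project pipeline):
# RidgeClassifier labels a sample by the sign of its
# linear decision score rather than by a probability
##################################
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier

X_toy, y_toy = make_classification(n_samples=100, n_features=5, random_state=0)
ridge_toy = RidgeClassifier(alpha=2.0, class_weight='balanced').fit(X_toy, y_toy)
scores = ridge_toy.decision_function(X_toy)   # signed linear scores
manual_labels = (scores > 0).astype(int)      # positive score -> class 1
assert (manual_labels == ridge_toy.predict(X_toy)).all()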

  1. The ridge classifier model from the sklearn.linear_model Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • alpha = regularization strength made to vary between 1.0 and 2.0
    • solver = solver to use in the computational routines made to vary between sag and saga
    • tol = precision of the solution made to vary between 1e-3 and 1e-4
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • alpha = 2.0
    • solver = saga
    • tol = 1e-4
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8872
    • Precision = 0.7638
    • Recall = 0.9016
    • F1 Score = 0.8270
    • AUROC = 0.8913
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8985
    • Precision = 0.7826
    • Recall = 0.9000
    • F1 Score = 0.8372
    • AUROC = 0.8989
  7. The apparent and independent validation model performance measures were sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [426]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [427]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
blended_baselearner_rc_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('blended_baselearner_rc_model', RidgeClassifier(class_weight='balanced',
                                                     random_state=987654321))
])
In [428]:
##################################
# Defining hyperparameter grid
##################################
blended_baselearner_rc_hyperparameter_grid = {
    'blended_baselearner_rc_model__alpha': [1.00, 2.00],
    'blended_baselearner_rc_model__solver': ['sag', 'saga'],
    'blended_baselearner_rc_model__tol': [1e-3, 1e-4]
}
In [429]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
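This repeated stratified scheme, assigned to cv_strategy above, yields 5 x 5 = 25 resampled splits per hyperparameter candidate; with the 2 x 2 x 2 grid defined earlier, that accounts for the "Fitting 25 folds for each of 8 candidates, totalling 200 fits" message logged below. A quick illustrative check:

##################################
# Quick check (illustrative): 25 splits per candidate
# times 8 grid candidates equals the 200 fits logged below
##################################
print(cv_strategy.get_n_splits())              # 25
print(cv_strategy.get_n_splits() * (2*2*2))    # 200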
In [430]:
##################################
# Performing Grid Search with cross-validation
##################################
blended_baselearner_rc_grid_search = GridSearchCV(
    estimator=blended_baselearner_rc_pipeline,
    param_grid=blended_baselearner_rc_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [431]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
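OrdinalEncoder assigns integer codes in the sorted order of the observed categories, which is why the No and Yes recurrence labels appear as the 0.0 and 1.0 classes in the classification reports. The two-row toy array below (assuming exactly these label values) illustrates the mapping:

##################################
# Minimal check (assuming 'No'/'Yes' response labels):
# OrdinalEncoder codes sorted categories, so 'No' -> 0.0
# and 'Yes' -> 1.0 as shown in the classification reports
##################################
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

toy_encoder = OrdinalEncoder().fit(np.array([['No'], ['Yes']]))
print(toy_encoder.categories_)                                      # [array(['No', 'Yes'], ...)]
print(toy_encoder.transform(np.array([['Yes'], ['No']])).ravel())   # [1. 0.]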
In [432]:
##################################
# Fitting GridSearchCV
##################################
blended_baselearner_rc_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[432]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('blended_baselearner_rc_model',
                                        RidgeClassifier(class_weight='balanced',
                                                        random_state=987654321))]),
             n_jobs=-1,
             param_grid={'blended_baselearner_rc_model__alpha': [1.0, 2.0],
                         'blended_baselearner_rc_model__solver': ['sag',
                                                                  'saga'],
                         'blended_baselearner_rc_model__tol': [0.001, 0.0001]},
             scoring='f1', verbose=1)
In [433]:
##################################
# Identifying the best model
##################################
blended_baselearner_rc_optimal = blended_baselearner_rc_grid_search.best_estimator_
In [434]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
blended_baselearner_rc_optimal_f1_cv = blended_baselearner_rc_grid_search.best_score_
blended_baselearner_rc_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_train))
blended_baselearner_rc_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_validation))
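Since F1 is the harmonic mean of precision and recall, the tuning metric can be sanity-checked by hand from the train precision and recall reported for this model:

##################################
# Worked check: F1 as the harmonic mean of the train
# precision and recall reported for this model
##################################
precision, recall = 0.763889, 0.901639
print(2 * precision * recall / (precision + recall))  # ~0.827068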
In [435]:
##################################
# Identifying the optimal model
##################################
print('Best Blended Base Learner Ridge Classifier: ')
print(f"Best Blended Base Learner Ridge Classifier Hyperparameters: {blended_baselearner_rc_grid_search.best_params_}")
Best Blended Base Learner Ridge Classifier: 
Best Blended Base Learner Ridge Classifier Hyperparameters: {'blended_baselearner_rc_model__alpha': 2.0, 'blended_baselearner_rc_model__solver': 'saga', 'blended_baselearner_rc_model__tol': 0.0001}
In [436]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {blended_baselearner_rc_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {blended_baselearner_rc_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8097
F1 Score on Training Data: 0.8271

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.95      0.88      0.92       143
         1.0       0.76      0.90      0.83        61

    accuracy                           0.89       204
   macro avg       0.86      0.89      0.87       204
weighted avg       0.90      0.89      0.89       204

In [437]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Ridge Classifier Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Ridge Classifier Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Ridge Classifier Train Performance]
In [438]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {blended_baselearner_rc_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8372

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93        49
         1.0       0.78      0.90      0.84        20

    accuracy                           0.90        69
   macro avg       0.87      0.90      0.88        69
weighted avg       0.91      0.90      0.90        69

In [439]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Ridge Classifier Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Ridge Classifier Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Ridge Classifier Validation Performance]
In [440]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_baselearner_rc_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_train))
blended_baselearner_rc_optimal_train['model'] = ['blended_baselearner_rc_optimal'] * 5
blended_baselearner_rc_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Base Learner Ridge Classifier Train Performance Metrics: ')
display(blended_baselearner_rc_optimal_train)
Optimal Blended Base Learner Ridge Classifier Train Performance Metrics: 
  metric_name  metric_value                           model    set
0    Accuracy      0.887255  blended_baselearner_rc_optimal  train
1   Precision      0.763889  blended_baselearner_rc_optimal  train
2      Recall      0.901639  blended_baselearner_rc_optimal  train
3          F1      0.827068  blended_baselearner_rc_optimal  train
4       AUROC      0.891379  blended_baselearner_rc_optimal  train
In [441]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_baselearner_rc_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_baselearner_rc_optimal.predict(X_preprocessed_validation))
blended_baselearner_rc_optimal_validation['model'] = ['blended_baselearner_rc_optimal'] * 5
blended_baselearner_rc_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Base Learner Ridge Classifier Validation Performance Metrics: ')
display(blended_baselearner_rc_optimal_validation)
Optimal Blended Base Learner Ridge Classifier Validation Performance Metrics: 
  metric_name  metric_value                           model         set
0    Accuracy      0.898551  blended_baselearner_rc_optimal  validation
1   Precision      0.782609  blended_baselearner_rc_optimal  validation
2      Recall      0.900000  blended_baselearner_rc_optimal  validation
3          F1      0.837209  blended_baselearner_rc_optimal  validation
4       AUROC      0.898980  blended_baselearner_rc_optimal  validation
In [442]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(blended_baselearner_rc_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_baselearner_ridge_classifier_optimal.pkl"))
Out[442]:
['..\\models\\blended_model_baselearner_ridge_classifier_optimal.pkl']

1.10.4 Base Learner - Neural Network ¶

Neural Network is a classification algorithm inspired by the human brain, consisting of layers of interconnected neurons that transform input features through weighted connections and activation functions. It learns patterns in data through backpropagation, where the network adjusts its internal weights to minimize classification error. The process involves an input layer receiving data, multiple hidden layers extracting hierarchical features, and an output layer producing a final prediction. The key advantages of neural networks include their ability to model highly complex, non-linear relationships, making them suitable for image, text, and speech classification tasks. They are also highly scalable, capable of handling massive datasets. However, neural networks have several challenges: they require substantial computational resources, especially for deep architectures; they need large amounts of labeled data for effective training; and they are often difficult to interpret due to their "black box" nature. Additionally, hyperparameter tuning, including choosing the number of layers, neurons, and activation functions, is non-trivial and requires careful optimization to prevent overfitting or underfitting.
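To make the layered transform concrete, the minimal sketch below reproduces a fitted MLPClassifier's binary predictions by hand from its learned weights, mirroring the single (50,)-unit relu architecture tuned in this section; the toy dataset and names (X_toy, mlp_toy) are illustrative, not part of the project pipeline.

##################################
# Minimal sketch (toy data, not the project pipeline): reproducing
# MLPClassifier's binary forward pass from its learned weights with
# a relu hidden layer and a logistic output unit
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=5, random_state=0)
mlp_toy = MLPClassifier(hidden_layer_sizes=(50,), activation='relu',
                        solver='lbfgs', max_iter=500, random_state=0).fit(X_toy, y_toy)

hidden = np.maximum(0, X_toy @ mlp_toy.coefs_[0] + mlp_toy.intercepts_[0])  # relu hidden layer
logits = hidden @ mlp_toy.coefs_[1] + mlp_toy.intercepts_[1]                # linear output layer
proba_manual = 1 / (1 + np.exp(-logits.ravel()))                            # logistic output unit
assert np.allclose(proba_manual, mlp_toy.predict_proba(X_toy)[:, 1])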

  1. The neural network model from the sklearn.neural_network Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • hidden_layer_sizes = number of neurons in each hidden layer (the ith element gives the size of the ith layer) made to vary between (50,) and (100,)
    • activation = activation function for the hidden layer made to vary between relu and tanh
    • alpha = strength of the L2 regularization term made to vary between 0.0001 and 0.001
  3. No special hyperparameter was available in the model to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories, as MLPClassifier does not support class weighting.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • hidden_layer_sizes = (50,)
    • activation = relu
    • alpha = 0.0001
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8921
    • Precision = 0.8095
    • Recall = 0.8360
    • F1 Score = 0.8225
    • AUROC = 0.8760
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8840
    • Precision = 0.7727
    • Recall = 0.8500
    • F1 Score = 0.8095
    • AUROC = 0.8739
  7. The apparent and independent validation model performance measures were sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [443]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [444]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
blended_baselearner_nn_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('blended_baselearner_nn_model', MLPClassifier(max_iter=500,
                                                   solver='lbfgs',
                                                   early_stopping=False,
                                                   random_state=987654321))
])
In [445]:
##################################
# Defining hyperparameter grid
##################################
blended_baselearner_nn_hyperparameter_grid = {
    'blended_baselearner_nn_model__hidden_layer_sizes': [(50,), (100,)],
    'blended_baselearner_nn_model__activation': ['relu', 'tanh'],
    'blended_baselearner_nn_model__alpha': [0.0001, 0.001]
}
In [446]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [447]:
##################################
# Performing Grid Search with cross-validation
##################################
blended_baselearner_nn_grid_search = GridSearchCV(
    estimator=blended_baselearner_nn_pipeline,
    param_grid=blended_baselearner_nn_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [448]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [449]:
##################################
# Fitting GridSearchCV
##################################
blended_baselearner_nn_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[449]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('blended_baselearner_nn_model',
                                        MLPClassifier(max_iter=500,
                                                      random_state=987654321,
                                                      solver='lbfgs'))]),
             n_jobs=-1,
             param_grid={'blended_baselearner_nn_model__activation': ['relu',
                                                                      'tanh'],
                         'blended_baselearner_nn_model__alpha': [0.0001, 0.001],
                         'blended_baselearner_nn_model__hidden_layer_sizes': [(50,),
                                                                              (100,)]},
             scoring='f1', verbose=1)
In [450]:
##################################
# Identifying the best model
##################################
blended_baselearner_nn_optimal = blended_baselearner_nn_grid_search.best_estimator_
In [451]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
blended_baselearner_nn_optimal_f1_cv = blended_baselearner_nn_grid_search.best_score_
blended_baselearner_nn_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_train))
blended_baselearner_nn_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_validation))
In [452]:
##################################
# Identifying the optimal model
##################################
print('Best Blended Base Learner Neural Network: ')
print(f"Best Blended Base Learner Neural Network Hyperparameters: {blended_baselearner_nn_grid_search.best_params_}")
Best Blended Base Learner Neural Network: 
Best Blended Base Learner Neural Network Hyperparameters: {'blended_baselearner_nn_model__activation': 'relu', 'blended_baselearner_nn_model__alpha': 0.0001, 'blended_baselearner_nn_model__hidden_layer_sizes': (50,)}
In [453]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {blended_baselearner_nn_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {blended_baselearner_nn_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8063
F1 Score on Training Data: 0.8226

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.93      0.92      0.92       143
         1.0       0.81      0.84      0.82        61

    accuracy                           0.89       204
   macro avg       0.87      0.88      0.87       204
weighted avg       0.89      0.89      0.89       204

In [454]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Neural Network Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Neural Network Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Neural Network Train Performance]
In [455]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {blended_baselearner_nn_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8095

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.94      0.90      0.92        49
         1.0       0.77      0.85      0.81        20

    accuracy                           0.88        69
   macro avg       0.85      0.87      0.86        69
weighted avg       0.89      0.88      0.89        69

In [456]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Neural Network Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Neural Network Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Neural Network Validation Performance]
In [457]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_baselearner_nn_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_train))
blended_baselearner_nn_optimal_train['model'] = ['blended_baselearner_nn_optimal'] * 5
blended_baselearner_nn_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Base Learner Neural Network Train Performance Metrics: ')
display(blended_baselearner_nn_optimal_train)
Optimal Blended Base Learner Neural Network Train Performance Metrics: 
  metric_name  metric_value                           model    set
0    Accuracy      0.892157  blended_baselearner_nn_optimal  train
1   Precision      0.809524  blended_baselearner_nn_optimal  train
2      Recall      0.836066  blended_baselearner_nn_optimal  train
3          F1      0.822581  blended_baselearner_nn_optimal  train
4       AUROC      0.876075  blended_baselearner_nn_optimal  train
In [458]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_baselearner_nn_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_baselearner_nn_optimal.predict(X_preprocessed_validation))
blended_baselearner_nn_optimal_validation['model'] = ['blended_baselearner_nn_optimal'] * 5
blended_baselearner_nn_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Base Learner Neural Network Validation Performance Metrics: ')
display(blended_baselearner_nn_optimal_validation)
Optimal Blended Base Learner Neural Network Validation Performance Metrics: 
  metric_name  metric_value                           model         set
0    Accuracy      0.884058  blended_baselearner_nn_optimal  validation
1   Precision      0.772727  blended_baselearner_nn_optimal  validation
2      Recall      0.850000  blended_baselearner_nn_optimal  validation
3          F1      0.809524  blended_baselearner_nn_optimal  validation
4       AUROC      0.873980  blended_baselearner_nn_optimal  validation
In [459]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(blended_baselearner_nn_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_baselearner_neural_network_optimal.pkl"))
Out[459]:
['..\\models\\blended_model_baselearner_neural_network_optimal.pkl']

1.10.5 Base Learner - Decision Tree ¶

Decision Tree is a hierarchical classification model that recursively splits data based on feature values, forming a tree-like structure where each node represents a decision rule and each leaf represents a class label. The tree is built using a greedy algorithm that selects the best feature at each step based on criteria such as information gain or Gini impurity. The main advantages of decision trees include their interpretability, as the decision-making process can be easily visualized and understood, and their ability to model non-linear relationships without requiring extensive feature engineering. They also handle both numerical and categorical data well. However, decision trees are prone to overfitting, especially when deep trees are grown without pruning. Small changes in the dataset can lead to entirely different structures, making them unstable. Additionally, they tend to perform poorly on highly complex problems where relationships between variables are intricate, making ensemble methods such as Random Forest or Gradient Boosting more effective in practice.
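To illustrate the splitting criterion, the short sketch below computes the Gini impurity of a parent node and the weighted impurity of a candidate split; the parent counts mirror the 143/61 train class distribution, while the child counts are hypothetical.

##################################
# Minimal sketch (hypothetical child counts): Gini impurity of a
# node and the weighted impurity of a candidate split, as scored
# by the 'gini' criterion tuned in this section
##################################
import numpy as np

def gini(counts):
    p = np.asarray(counts, dtype=float) / sum(counts)
    return 1.0 - np.sum(p ** 2)

parent = [143, 61]                  # class counts mirroring the train data
left, right = [120, 10], [23, 51]   # hypothetical child nodes after a split
n = sum(parent)
weighted = (sum(left) / n) * gini(left) + (sum(right) / n) * gini(right)
print(round(gini(parent), 4), round(weighted, 4))  # 0.4192 vs 0.2459: impurity drops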

  1. The decision tree model from the sklearn.tree Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • criterion = gini
    • max_depth = 6
    • min_samples_leaf = 5
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8970
    • Precision = 0.7500
    • Recall = 0.9836
    • F1 Score = 0.8510
    • AUROC = 0.9218
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.8550
    • Precision = 0.6666
    • Recall = 1.0000
    • F1 Score = 0.8000
    • AUROC = 0.8979
  7. The apparent and independent validation model performance measures were sufficiently comparable, which might be indicative of the absence of excessive model overfitting.
In [460]:
##################################
# Defining the categorical preprocessing parameters
##################################
categorical_features = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response']
categorical_transformer = OrdinalEncoder()
categorical_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features)],
                                             remainder='passthrough',
                                             force_int_remainder_cols=False)
In [461]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
blended_baselearner_dt_pipeline = Pipeline([
    ('categorical_preprocessor', categorical_preprocessor),
    ('blended_baselearner_dt_model', DecisionTreeClassifier(class_weight='balanced',
                                                            random_state=987654321))
])
In [462]:
##################################
# Defining hyperparameter grid
##################################
blended_baselearner_dt_hyperparameter_grid = {
    'blended_baselearner_dt_model__criterion': ['gini', 'entropy'],
    'blended_baselearner_dt_model__max_depth': [3, 6],
    'blended_baselearner_dt_model__min_samples_leaf': [5, 10]
}
In [463]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [464]:
##################################
# Performing Grid Search with cross-validation
##################################
blended_baselearner_dt_grid_search = GridSearchCV(
    estimator=blended_baselearner_dt_pipeline,
    param_grid=blended_baselearner_dt_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [465]:
##################################
# Encoding the response variables
# for model evaluation
##################################
y_encoder = OrdinalEncoder()
y_encoder.fit(y_preprocessed_train.values.reshape(-1, 1))
y_preprocessed_train_encoded = y_encoder.transform(y_preprocessed_train.values.reshape(-1, 1)).ravel()
y_preprocessed_validation_encoded = y_encoder.transform(y_preprocessed_validation.values.reshape(-1, 1)).ravel()
In [466]:
##################################
# Fitting GridSearchCV
##################################
blended_baselearner_dt_grid_search.fit(X_preprocessed_train, y_preprocessed_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[466]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('categorical_preprocessor',
                                        ColumnTransformer(force_int_remainder_cols=False,
                                                          remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OrdinalEncoder(),
                                                                         ['Gender',
                                                                          'Smoking',
                                                                          'Physical_Examination',
                                                                          'Adenopathy',
                                                                          'Focality',
                                                                          'Risk',
                                                                          'T',
                                                                          'Stage',
                                                                          'Response'])])),
                                       ('blended_baselearner_dt_model',
                                        DecisionTreeClassifier(class_weight='balanced',
                                                               random_state=987654321))]),
             n_jobs=-1,
             param_grid={'blended_baselearner_dt_model__criterion': ['gini',
                                                                     'entropy'],
                         'blended_baselearner_dt_model__max_depth': [3, 6],
                         'blended_baselearner_dt_model__min_samples_leaf': [5,
                                                                            10]},
             scoring='f1', verbose=1)
In [467]:
##################################
# Identifying the best model
##################################
blended_baselearner_dt_optimal = blended_baselearner_dt_grid_search.best_estimator_
In [468]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
blended_baselearner_dt_optimal_f1_cv = blended_baselearner_dt_grid_search.best_score_
blended_baselearner_dt_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_train))
blended_baselearner_dt_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_validation))
In [469]:
##################################
# Identifying the optimal model
##################################
print('Best Blended Base Learner Decision Trees: ')
print(f"Best Blended Base Learner Decision Trees Hyperparameters: {blended_baselearner_dt_grid_search.best_params_}")
Best Blended Base Learner Decision Trees: 
Best Blended Base Learner Decision Trees Hyperparameters: {'blended_baselearner_dt_model__criterion': 'gini', 'blended_baselearner_dt_model__max_depth': 6, 'blended_baselearner_dt_model__min_samples_leaf': 5}
In [470]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {blended_baselearner_dt_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {blended_baselearner_dt_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_train)))
F1 Score on Cross-Validated Data: 0.8099
F1 Score on Training Data: 0.8511

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.99      0.86      0.92       143
         1.0       0.75      0.98      0.85        61

    accuracy                           0.90       204
   macro avg       0.87      0.92      0.89       204
weighted avg       0.92      0.90      0.90       204

In [471]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_train))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Decision Trees Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Decision Trees Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Decision Trees Train Performance]
In [472]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {blended_baselearner_dt_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_validation)))
F1 Score on Validation Data: 0.8000

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       1.00      0.80      0.89        49
         1.0       0.67      1.00      0.80        20

    accuracy                           0.86        69
   macro avg       0.83      0.90      0.84        69
weighted avg       0.90      0.86      0.86        69

In [473]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_validation))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Base Learner Decision Trees Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Base Learner Decision Trees Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices - Optimal Blended Base Learner Decision Trees Validation Performance]
In [474]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_baselearner_dt_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_train))
blended_baselearner_dt_optimal_train['model'] = ['blended_baselearner_dt_optimal'] * 5
blended_baselearner_dt_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Base Learner Decision Tree Train Performance Metrics: ')
display(blended_baselearner_dt_optimal_train)
Optimal Blended Base Learner Decision Tree Train Performance Metrics: 
  metric_name  metric_value                           model    set
0    Accuracy      0.897059  blended_baselearner_dt_optimal  train
1   Precision      0.750000  blended_baselearner_dt_optimal  train
2      Recall      0.983607  blended_baselearner_dt_optimal  train
3          F1      0.851064  blended_baselearner_dt_optimal  train
4       AUROC      0.921873  blended_baselearner_dt_optimal  train
In [475]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_baselearner_dt_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_baselearner_dt_optimal.predict(X_preprocessed_validation))
blended_baselearner_dt_optimal_validation['model'] = ['blended_baselearner_dt_optimal'] * 5
blended_baselearner_dt_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Base Learner Decision Tree Validation Performance Metrics: ')
display(blended_baselearner_dt_optimal_validation)
Optimal Blended Base Learner Decision Tree Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.855072 blended_baselearner_dt_optimal validation
1 Precision 0.666667 blended_baselearner_dt_optimal validation
2 Recall 1.000000 blended_baselearner_dt_optimal validation
3 F1 0.800000 blended_baselearner_dt_optimal validation
4 AUROC 0.897959 blended_baselearner_dt_optimal validation
In [476]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(blended_baselearner_dt_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_baselearner_decision_trees_optimal.pkl"))
Out[476]:
['..\\models\\blended_model_baselearner_decision_trees_optimal.pkl']

1.10.6 Meta Learner - Logistic Regression ¶

Logistic Regression is a linear classification algorithm that estimates the probability of a binary outcome using the logistic (sigmoid) function. It assumes a linear relationship between the predictor variables and the log-odds of the target class. The algorithm involves calculating a weighted sum of input features, applying the sigmoid function to transform the result into a probability, and assigning a class label based on a threshold (typically 0.5). Logistic regression is simple, interpretable, and computationally efficient, making it a popular choice for baseline models and problems where relationships between features and the target variable are approximately linear. It also provides insight into feature importance through its learned coefficients. However, logistic regression has limitations: it struggles with non-linear relationships unless feature engineering or polynomial terms are used, it is sensitive to multicollinearity, and it assumes independence between predictor variables, which may not always hold in real-world data. Additionally, it may perform poorly when classes are highly imbalanced, requiring techniques such as weighting or resampling to improve predictions.
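
To make the mechanics described above concrete, the minimal sketch below computes a class probability from a weighted sum of inputs via the sigmoid function; the coefficients, intercept, and feature vector are hypothetical values chosen purely for illustration.

import numpy as np

##################################
# Minimal sketch of the logistic
# regression decision rule using
# hypothetical coefficients and inputs
##################################
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

coefficients = np.array([0.8, -1.2, 0.5])  # hypothetical learned weights
intercept = -0.3                           # hypothetical learned bias
x = np.array([1.0, 0.5, 2.0])              # hypothetical feature vector

log_odds = np.dot(coefficients, x) + intercept  # weighted sum of input features
probability = sigmoid(log_odds)                 # transform the log-odds into a probability
predicted_class = int(probability >= 0.5)       # assign a class label using a 0.5 threshold

print(f"Log-odds: {log_odds:.4f} | Probability: {probability:.4f} | Class: {predicted_class}")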

  1. The logistic regression model from the sklearn.linear_model Python library API was implemented.
  2. The model contains 3 fixed hyperparameters:
    • C = inverse of regularization strength held constant at a value of 1.0
    • penalty = penalty norm held constant at a value of l2
    • solver = algorithm used in the optimization problem held constant at a value of lbfgs
  3. A special hyperparameter (class_weight = balanced) was fixed to address the moderate 2:1 class imbalance observed between the No and Yes Recurred categories.
  4. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9068
    • Precision = 0.8000
    • Recall = 0.9180
    • F1 Score = 0.8549
    • AUROC = 0.9100
  5. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9275
    • Precision = 0.8260
    • Recall = 0.9500
    • F1 Score = 0.8837
    • AUROC = 0.9341
  6. The apparent and independent validation model performance measures were sufficiently comparable, suggesting the absence of excessive model overfitting.
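As a quick numeric check of this observation, the sketch below computes the optimism gap between the apparent and independent validation F1 scores reported above; the variable names are illustrative and the values are copied from the summary.

##################################
# Quick overfitting-optimism check
# using the F1 scores reported above
# (illustrative variable names)
##################################
apparent_f1 = 0.8549     # F1 score on the train data
validation_f1 = 0.8837   # F1 score on the validation data
optimism = apparent_f1 - validation_f1
print(f"Optimism (apparent - validation F1): {optimism:+.4f}")
# A small or negative gap is consistent with the absence of excessive overfitting.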
In [477]:
##################################
# Defining the blending strategy (75-25 development-holdout split)
##################################
X_preprocessed_train_development, X_preprocessed_holdout, y_preprocessed_train_development, y_preprocessed_holdout = train_test_split(
    X_preprocessed_train, y_preprocessed_train_encoded, 
    test_size=0.25, 
    random_state=987654321
)
In [478]:
##################################
# Loading the pre-trained base learners
# from the previously saved pickle files
##################################
blended_baselearners = {}
blended_baselearner_model = ['knn', 'svm', 'ridge_classifier', 'neural_network', 'decision_trees']
for name in blended_baselearner_model:
    blended_baselearner_model_path = os.path.join("..", MODELS_PATH, f"blended_model_baselearner_{name}_optimal.pkl")
    blended_baselearners[name] = joblib.load(blended_baselearner_model_path)
    
In [479]:
##################################
# Initializing the meta-feature matrices
##################################
meta_train_blended = np.zeros((X_preprocessed_holdout.shape[0], len(blended_baselearners)))
meta_validation_blended = np.zeros((X_preprocessed_validation.shape[0], len(blended_baselearners)))
In [480]:
##################################
# Generating hold-out predictions for training the meta learner
##################################
for i, (name, model) in enumerate(blended_baselearners.items()):
    model.fit(X_preprocessed_train_development, y_preprocessed_train_development)  
    meta_train_blended[:, i] = model.predict_proba(X_preprocessed_holdout)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_holdout)
    meta_validation_blended[:, i] = model.predict_proba(X_preprocessed_validation)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_validation)
In [481]:
##################################
# Training the meta learner on the stacked features
##################################
blended_metalearner_lr_optimal = LogisticRegression(class_weight='balanced', 
                                                    penalty='l2', 
                                                    C=1.0, 
                                                    solver='lbfgs', 
                                                    random_state=987654321)
blended_metalearner_lr_optimal.fit(meta_train_blended, y_preprocessed_holdout)
Out[481]:
LogisticRegression(class_weight='balanced', random_state=987654321)
In [482]:
##################################
# Saving the meta learner model
# developed from the meta-train data
################################## 
joblib.dump(blended_metalearner_lr_optimal, 
            os.path.join("..", MODELS_PATH, "blended_model_metalearner_logistic_regression_optimal.pkl"))
Out[482]:
['..\\models\\blended_model_metalearner_logistic_regression_optimal.pkl']
In [483]:
##################################
# Creating a function to extract the 
# meta-feature matrix for new data
################################## 
def extract_blended_metafeature_matrix(X_preprocessed_new):
    ##################################
    # Loading the pre-trained base learners
    # from the previously saved pickle files
    ##################################
    blended_baselearners = {}
    blended_baselearner_model = ['knn', 'svm', 'ridge_classifier', 'neural_network', 'decision_trees']
    for name in blended_baselearner_model:
        blended_baselearner_model_path = os.path.join("..", MODELS_PATH, f"blended_model_baselearner_{name}_optimal.pkl")
        blended_baselearners[name] = joblib.load(blended_baselearner_model_path)

    ##################################
    # Initializing the meta-feature matrix
    # for the new data
    ##################################
    meta_new_blended = np.zeros((X_preprocessed_new.shape[0], len(blended_baselearners)))

    ##################################
    # Refitting each base learner on the
    # development split (module-level variables)
    # to reproduce the state used when training
    # the meta learner, then generating the
    # meta-features for the new data
    ##################################
    for i, (name, model) in enumerate(blended_baselearners.items()):
        model.fit(X_preprocessed_train_development, y_preprocessed_train_development)
        meta_new_blended[:, i] = model.predict_proba(X_preprocessed_new)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_preprocessed_new)

    return meta_new_blended
In [484]:
##################################
# Evaluating the F1 scores
# on the training and validation data
##################################
blended_metalearner_lr_optimal_f1_train = f1_score(y_preprocessed_train_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_train)))
blended_metalearner_lr_optimal_f1_validation = f1_score(y_preprocessed_validation_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_validation)))
In [485]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training data
# to assess overfitting optimism
##################################
print(f"F1 Score on Training Data: {blended_metalearner_lr_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_preprocessed_train_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_train))))
F1 Score on Training Data: 0.8550

Classification Report on Train Data:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93       143
         1.0       0.80      0.92      0.85        61

    accuracy                           0.91       204
   macro avg       0.88      0.91      0.89       204
weighted avg       0.91      0.91      0.91       204

In [486]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_preprocessed_train_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_train)))
cm_normalized = confusion_matrix(y_preprocessed_train_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_train)), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Meta Learner Logistic Regression Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Meta Learner Logistic Regression Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices for the Optimal Blended Meta Learner Logistic Regression Train Performance]
In [487]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validationing Data: {blended_metalearner_lr_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_preprocessed_validation_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_validation))))
F1 Score on Validation Data: 0.8837

Classification Report on Validation Data:
               precision    recall  f1-score   support

         0.0       0.98      0.92      0.95        49
         1.0       0.83      0.95      0.88        20

    accuracy                           0.93        69
   macro avg       0.90      0.93      0.92        69
weighted avg       0.93      0.93      0.93        69

In [488]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_preprocessed_validation_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_validation)))
cm_normalized = confusion_matrix(y_preprocessed_validation_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_validation)), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Blended Meta Learner Logistic Regression Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Blended Meta Learner Logistic Regression Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and Normalized Confusion Matrices for the Optimal Blended Meta Learner Logistic Regression Validation Performance]
In [489]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
blended_metalearner_lr_optimal_train = model_performance_evaluation(y_preprocessed_train_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_train)))
blended_metalearner_lr_optimal_train['model'] = ['blended_metalearner_lr_optimal'] * 5
blended_metalearner_lr_optimal_train['set'] = ['train'] * 5
print('Optimal Blended Meta Learner Logistic Regression Train Performance Metrics: ')
display(blended_metalearner_lr_optimal_train)
Optimal Blended Meta Learner Logistic Regression Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.906863 blended_metalearner_lr_optimal train
1 Precision 0.800000 blended_metalearner_lr_optimal train
2 Recall 0.918033 blended_metalearner_lr_optimal train
3 F1 0.854962 blended_metalearner_lr_optimal train
4 AUROC 0.910065 blended_metalearner_lr_optimal train
In [490]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
blended_metalearner_lr_optimal_validation = model_performance_evaluation(y_preprocessed_validation_encoded, blended_metalearner_lr_optimal.predict(extract_blended_metafeature_matrix(X_preprocessed_validation)))
blended_metalearner_lr_optimal_validation['model'] = ['blended_metalearner_lr_optimal'] * 5
blended_metalearner_lr_optimal_validation['set'] = ['validation'] * 5
print('Optimal Blended Meta Learner Logistic Regression Validation Performance Metrics: ')
display(blended_metalearner_lr_optimal_validation)
Optimal Blended Meta Learner Logistic Regression Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.927536 blended_metalearner_lr_optimal validation
1 Precision 0.826087 blended_metalearner_lr_optimal validation
2 Recall 0.950000 blended_metalearner_lr_optimal validation
3 F1 0.883721 blended_metalearner_lr_optimal validation
4 AUROC 0.934184 blended_metalearner_lr_optimal validation

1.11 Consolidated Findings ¶

  1. Among the 12 candidate models, the Blended Model developed by training a Meta Learner on the combined predictions of multiple Base Learners was selected as the final model, demonstrating the best F1 score on the independent validation data with minimal overfitting:
    • Apparent F1 Score Performance = 0.8549
    • Independent Validation F1 Score Performance = 0.8837
  2. The final model similarly demonstrated a consistently high F1 score on the independent test data:
    • Independent Test F1 Score Performance = 0.8571
  3. The final model configuration is described as follows (a minimal inference sketch is provided after this list):
    • Base Learner: k-nearest neighbors with optimal hyperparameters:
      • n_neighbors = 3
      • weights = uniform
      • metric = minkowski
    • Base Learner: support vector machine with optimal hyperparameters:
      • C = 1.0
      • kernel = linear
      • gamma = scale
    • Base Learner: ridge classifier with optimal hyperparameters:
      • alpha = 2.0
      • solver = saga
      • tol = 1e-4
    • Base Learner: neural network with optimal hyperparameters:
      • hidden_layer_sizes = (50,)
      • activation = relu
      • alpha = 0.0001
    • Base Learner: decision tree with optimal hyperparameters:
      • criterion = gini
      • max_depth = 6
      • min_samples_leaf = 5
    • Meta Learner: logistic regression model with optimal hyperparameters:
      • C = 1.0
      • penalty = l2
      • solver = lbfgs
  4. Only 2 of the 5 base learners demonstrated a significant contribution to the final prediction, as indicated by positive permutation-based importance values:
    • Base Learner: ridge classifier
    • Base Learner: support vector machine
  5. The remaining 3 base learners did not demonstrate a significant contribution to the final prediction, as indicated by negative permutation-based importance values:
    • Base Learner: decision tree
    • Base Learner: k-nearest neighbors
    • Base Learner: neural network
  6. For each of the significantly contributing base learners, the predictors with positive permutation-based importance are given as follows:
    • Base Learner: ridge classifier
      • Age
      • T
      • Focality
      • Smoking
      • Response
    • Base Learner: support vector machine
      • Age
      • T
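To make the final configuration actionable, the sketch below outlines how new, preprocessed observations could be scored with the final blended model by rebuilding the meta-feature matrix from the persisted base learners and passing it to the persisted meta learner; it assumes the extract_blended_metafeature_matrix helper and pickle file names defined earlier in this notebook, and X_preprocessed_new is a hypothetical input.

##################################
# Minimal inference sketch for the
# final blended model, assuming the
# helper function and saved pickle files
# defined earlier in this notebook
# (X_preprocessed_new is a hypothetical
# preprocessed feature matrix)
##################################
import os
import joblib

final_metalearner = joblib.load(
    os.path.join("..", MODELS_PATH, "blended_model_metalearner_logistic_regression_optimal.pkl"))

# Rebuild the meta-feature matrix from the persisted base learners
meta_features_new = extract_blended_metafeature_matrix(X_preprocessed_new)

# Predict the encoded class labels and the class probabilities
predicted_labels = final_metalearner.predict(meta_features_new)
predicted_probabilities = final_metalearner.predict_proba(meta_features_new)[:, 1]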
In [491]:
##################################
# Consolidating all the
# bagged, boosted, stacked and blended
# model performance measures
# for the train and validation data
##################################
ensemble_train_validation_all_performance = pd.concat([bagged_rf_optimal_train,
                                             bagged_rf_optimal_validation,
                                             bagged_et_optimal_train,
                                             bagged_et_optimal_validation,
                                             bagged_bdt_optimal_train,
                                             bagged_bdt_optimal_validation,
                                             bagged_blr_optimal_train,
                                             bagged_blr_optimal_validation,
                                             bagged_bsvm_optimal_train,
                                             bagged_bsvm_optimal_validation,
                                             boosted_ab_optimal_train,
                                             boosted_ab_optimal_validation,
                                             boosted_gb_optimal_train,
                                             boosted_gb_optimal_validation,
                                             boosted_xgb_optimal_train,
                                             boosted_xgb_optimal_validation,
                                             boosted_lgbm_optimal_train,
                                             boosted_lgbm_optimal_validation,
                                             boosted_cb_optimal_train,
                                             boosted_cb_optimal_validation,
                                             stacked_baselearner_knn_optimal_train, 
                                             stacked_baselearner_knn_optimal_validation,
                                             stacked_baselearner_svm_optimal_train, 
                                             stacked_baselearner_svm_optimal_validation,
                                             stacked_baselearner_rc_optimal_train, 
                                             stacked_baselearner_rc_optimal_validation,
                                             stacked_baselearner_nn_optimal_train, 
                                             stacked_baselearner_nn_optimal_validation,
                                             stacked_baselearner_dt_optimal_train, 
                                             stacked_baselearner_dt_optimal_validation,
                                             stacked_metalearner_lr_optimal_train, 
                                             stacked_metalearner_lr_optimal_validation,
                                             blended_baselearner_knn_optimal_train, 
                                             blended_baselearner_knn_optimal_validation,
                                             blended_baselearner_svm_optimal_train, 
                                             blended_baselearner_svm_optimal_validation,
                                             blended_baselearner_rc_optimal_train, 
                                             blended_baselearner_rc_optimal_validation,
                                             blended_baselearner_nn_optimal_train, 
                                             blended_baselearner_nn_optimal_validation,
                                             blended_baselearner_dt_optimal_train, 
                                             blended_baselearner_dt_optimal_validation,
                                             blended_metalearner_lr_optimal_train, 
                                             blended_metalearner_lr_optimal_validation], 
                                            ignore_index=True)
print('Consolidated Ensemble Model Performance on Train and Validation Data: ')
display(ensemble_train_validation_all_performance)
Consolidated Ensemble Model Performance on Train and Validation Data: 
metric_name metric_value model set
0 Accuracy 0.892157 bagged_rf_optimal train
1 Precision 0.774648 bagged_rf_optimal train
2 Recall 0.901639 bagged_rf_optimal train
3 F1 0.833333 bagged_rf_optimal train
4 AUROC 0.894876 bagged_rf_optimal train
... ... ... ... ...
215 Accuracy 0.927536 blended_metalearner_lr_optimal validation
216 Precision 0.826087 blended_metalearner_lr_optimal validation
217 Recall 0.950000 blended_metalearner_lr_optimal validation
218 F1 0.883721 blended_metalearner_lr_optimal validation
219 AUROC 0.934184 blended_metalearner_lr_optimal validation

220 rows × 4 columns

In [492]:
##################################
# Consolidating all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_all_performance_F1 = ensemble_train_validation_all_performance[ensemble_train_validation_all_performance['metric_name']=='F1']
ensemble_train_validation_all_performance_F1_train = ensemble_train_validation_all_performance_F1[ensemble_train_validation_all_performance_F1['set']=='train'].loc[:,"metric_value"]
ensemble_train_validation_all_performance_F1_validation = ensemble_train_validation_all_performance_F1[ensemble_train_validation_all_performance_F1['set']=='validation'].loc[:,"metric_value"]
In [493]:
##################################
# Combining all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_all_performance_F1_plot = pd.DataFrame({'train': ensemble_train_validation_all_performance_F1_train.values,
                                                              'validation': ensemble_train_validation_all_performance_F1_validation.values},
                                                             index=ensemble_train_validation_all_performance_F1['model'].unique())
ensemble_train_validation_all_performance_F1_plot
Out[493]:
train validation
bagged_rf_optimal 0.833333 0.837209
bagged_et_optimal 0.833333 0.837209
bagged_bdt_optimal 0.846154 0.857143
bagged_blr_optimal 0.833333 0.837209
bagged_bsvm_optimal 0.852713 0.857143
boosted_ab_optimal 0.843750 0.857143
boosted_gb_optimal 0.910569 0.829268
boosted_xgb_optimal 0.850394 0.857143
boosted_lgbm_optimal 0.894309 0.820513
boosted_cb_optimal 0.843750 0.857143
stacked_baselearner_knn_optimal 0.862069 0.648649
stacked_baselearner_svm_optimal 0.843750 0.857143
stacked_baselearner_rc_optimal 0.827068 0.837209
stacked_baselearner_nn_optimal 0.822581 0.809524
stacked_baselearner_dt_optimal 0.851064 0.800000
stacked_metalearner_lr_optimal 0.852713 0.857143
blended_baselearner_knn_optimal 0.862069 0.648649
blended_baselearner_svm_optimal 0.843750 0.857143
blended_baselearner_rc_optimal 0.827068 0.837209
blended_baselearner_nn_optimal 0.822581 0.809524
blended_baselearner_dt_optimal 0.851064 0.800000
blended_metalearner_lr_optimal 0.854962 0.883721
In [494]:
##################################
# Plotting all the F1 score
# model performance measures
# between the train and validation sets
##################################
ax = ensemble_train_validation_all_performance_F1_plot.plot.barh(figsize=(10, 20), width=0.9)
ax.set_xlim(0.00, 1.00)
ax.set_title("Model Comparison by F1 Score Performance on Train and Validation Data")
ax.set_xlabel("F1 Score Performance")
ax.set_ylabel("Ensemble Model")
ax.grid(False)
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in ax.containers:
    ax.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
[Figure: Model Comparison by F1 Score Performance on Train and Validation Data (All Bagged, Boosted, Stacked and Blended Models)]
In [495]:
##################################
# Consolidating all the final
# bagged, boosted, stacked and blended
# model performance measures
# for the train and validation data
##################################
ensemble_train_validation_performance = ensemble_train_validation_all_performance[
    ~ensemble_train_validation_all_performance['model'].str.contains('baselearner', case=False, na=False)
]
print('Consolidated Final Ensemble Model Performance on Train and Validation Data: ')
display(ensemble_train_validation_performance)
Consolidated Final Ensemble Model Performance on Train and Validation Data: 
metric_name metric_value model set
0 Accuracy 0.892157 bagged_rf_optimal train
1 Precision 0.774648 bagged_rf_optimal train
2 Recall 0.901639 bagged_rf_optimal train
3 F1 0.833333 bagged_rf_optimal train
4 AUROC 0.894876 bagged_rf_optimal train
... ... ... ... ...
215 Accuracy 0.927536 blended_metalearner_lr_optimal validation
216 Precision 0.826087 blended_metalearner_lr_optimal validation
217 Recall 0.950000 blended_metalearner_lr_optimal validation
218 F1 0.883721 blended_metalearner_lr_optimal validation
219 AUROC 0.934184 blended_metalearner_lr_optimal validation

120 rows × 4 columns

In [496]:
##################################
# Consolidating all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_performance_F1 = ensemble_train_validation_performance[ensemble_train_validation_performance['metric_name']=='F1']
ensemble_train_validation_performance_F1_train = ensemble_train_validation_performance_F1[ensemble_train_validation_performance_F1['set']=='train'].loc[:,"metric_value"]
ensemble_train_validation_performance_F1_validation = ensemble_train_validation_performance_F1[ensemble_train_validation_performance_F1['set']=='validation'].loc[:,"metric_value"]
In [497]:
##################################
# Combining all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_performance_F1_plot = pd.DataFrame({'train': ensemble_train_validation_performance_F1_train.values,
                                                              'validation': ensemble_train_validation_performance_F1_validation.values},
                                                             index=ensemble_train_validation_performance_F1['model'].unique())
ensemble_train_validation_performance_F1_plot
Out[497]:
train validation
bagged_rf_optimal 0.833333 0.837209
bagged_et_optimal 0.833333 0.837209
bagged_bdt_optimal 0.846154 0.857143
bagged_blr_optimal 0.833333 0.837209
bagged_bsvm_optimal 0.852713 0.857143
boosted_ab_optimal 0.843750 0.857143
boosted_gb_optimal 0.910569 0.829268
boosted_xgb_optimal 0.850394 0.857143
boosted_lgbm_optimal 0.894309 0.820513
boosted_cb_optimal 0.843750 0.857143
stacked_metalearner_lr_optimal 0.852713 0.857143
blended_metalearner_lr_optimal 0.854962 0.883721
In [498]:
##################################
# Plotting all the F1 score
# model performance measures
# between the train and validation sets
##################################
ax = ensemble_train_validation_performance_F1_plot.plot.barh(figsize=(10, 10), width=0.9)
ax.set_xlim(0.00, 1.00)
ax.set_title("Model Comparison by F1 Score Performance on Train and Validation Data")
ax.set_xlabel("F1 Score Performance")
ax.set_ylabel("Ensemble Model")
ax.grid(False)
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in ax.containers:
    ax.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
[Figure: Model Comparison by F1 Score Performance on Train and Validation Data (Final Ensemble Models)]
In [499]:
##################################
# Gathering all model performance measures
# for the validation data
##################################
ensemble_train_validation_performance_Accuracy_validation = ensemble_train_validation_performance[(ensemble_train_validation_performance['set']=='validation') & (ensemble_train_validation_performance['metric_name']=='Accuracy')].loc[:,"metric_value"]
ensemble_train_validation_performance_Precision_validation = ensemble_train_validation_performance[(ensemble_train_validation_performance['set']=='validation') & (ensemble_train_validation_performance['metric_name']=='Precision')].loc[:,"metric_value"]
ensemble_train_validation_performance_Recall_validation = ensemble_train_validation_performance[(ensemble_train_validation_performance['set']=='validation') & (ensemble_train_validation_performance['metric_name']=='Recall')].loc[:,"metric_value"]
ensemble_train_validation_performance_F1_validation = ensemble_train_validation_performance[(ensemble_train_validation_performance['set']=='validation') & (ensemble_train_validation_performance['metric_name']=='F1')].loc[:,"metric_value"]
ensemble_train_validation_performance_AUROC_validation = ensemble_train_validation_performance[(ensemble_train_validation_performance['set']=='validation') & (ensemble_train_validation_performance['metric_name']=='AUROC')].loc[:,"metric_value"]
In [500]:
##################################
# Combining all the model performance measures
# for the validation data
##################################
ensemble_train_validation_performance_all_plot_validation = pd.DataFrame({'accuracy': ensemble_train_validation_performance_Accuracy_validation.values,
                                                                    'precision': ensemble_train_validation_performance_Precision_validation.values,
                                                                    'recall': ensemble_train_validation_performance_Recall_validation.values,
                                                                    'f1': ensemble_train_validation_performance_F1_validation.values,
                                                                    'auroc': ensemble_train_validation_performance_AUROC_validation.values},
                                                                   index=ensemble_train_validation_performance['model'].unique())
ensemble_train_validation_performance_all_plot_validation
Out[500]:
accuracy precision recall f1 auroc
bagged_rf_optimal 0.898551 0.782609 0.90 0.837209 0.898980
bagged_et_optimal 0.898551 0.782609 0.90 0.837209 0.898980
bagged_bdt_optimal 0.913043 0.818182 0.90 0.857143 0.909184
bagged_blr_optimal 0.898551 0.782609 0.90 0.837209 0.898980
bagged_bsvm_optimal 0.913043 0.818182 0.90 0.857143 0.909184
boosted_ab_optimal 0.913043 0.818182 0.90 0.857143 0.909184
boosted_gb_optimal 0.898551 0.809524 0.85 0.829268 0.884184
boosted_xgb_optimal 0.913043 0.818182 0.90 0.857143 0.909184
boosted_lgbm_optimal 0.898551 0.842105 0.80 0.820513 0.869388
boosted_cb_optimal 0.913043 0.818182 0.90 0.857143 0.909184
stacked_metalearner_lr_optimal 0.913043 0.818182 0.90 0.857143 0.909184
blended_metalearner_lr_optimal 0.927536 0.826087 0.95 0.883721 0.934184
In [501]:
##################################
# Gathering the model evaluation metrics
# for the test data
##################################
##################################
# Defining a dictionary of models and 
# their corresponding feature extraction functions
##################################
models = {
    'bagged_rf_optimal': bagged_rf_optimal,
    'bagged_et_optimal': bagged_et_optimal,
    'bagged_bdt_optimal': bagged_bdt_optimal,
    'bagged_blr_optimal': bagged_blr_optimal,
    'bagged_bsvm_optimal': bagged_bsvm_optimal,
    'boosted_ab_optimal': boosted_ab_optimal,
    'boosted_gb_optimal': boosted_gb_optimal,
    'boosted_xgb_optimal': boosted_xgb_optimal,
    'boosted_lgbm_optimal': boosted_lgbm_optimal,
    'boosted_cb_optimal': boosted_cb_optimal,
    'stacked_baselearner_knn_optimal': stacked_baselearner_knn_optimal,
    'stacked_baselearner_svm_optimal': stacked_baselearner_svm_optimal,
    'stacked_baselearner_rc_optimal': stacked_baselearner_rc_optimal,
    'stacked_baselearner_nn_optimal': stacked_baselearner_nn_optimal,
    'stacked_baselearner_dt_optimal': stacked_baselearner_dt_optimal,
    'stacked_metalearner_lr_optimal': stacked_metalearner_lr_optimal,
    'blended_baselearner_knn_optimal': blended_baselearner_knn_optimal,
    'blended_baselearner_svm_optimal': blended_baselearner_svm_optimal,
    'blended_baselearner_rc_optimal': blended_baselearner_rc_optimal,
    'blended_baselearner_nn_optimal': blended_baselearner_nn_optimal,
    'blended_baselearner_dt_optimal': blended_baselearner_dt_optimal,
    'blended_metalearner_lr_optimal': blended_metalearner_lr_optimal
}

##################################
# Defining transformation functions for meta-learners
##################################
feature_extractors = {
    'stacked_metalearner_lr_optimal': extract_stacked_metafeature_matrix,
    'blended_metalearner_lr_optimal': extract_blended_metafeature_matrix
}
In [502]:
##################################
# Encoding the response variables
# for the test data
##################################
y_preprocessed_test_encoded = y_encoder.transform(y_preprocessed_test.values.reshape(-1, 1)).ravel()
In [503]:
##################################
# Storing the model evaluation metrics
# for the test data
##################################
ensemble_test_all_performance = []

##################################
# Looping through each model 
# and evaluate performance on test data
##################################
for model_name, model in models.items():
    # Applying transformation if needed (for meta-learner)
    X_input = feature_extractors.get(model_name, lambda x: x)(X_preprocessed_test)
    
    # Evaluating performance
    ensemble_test_all_performance_results = model_performance_evaluation(y_preprocessed_test_encoded, model.predict(X_input))
    
    # Adding metadata columns
    ensemble_test_all_performance_results['model'] = model_name
    ensemble_test_all_performance_results['set'] = 'test'
    
    # Storing result
    ensemble_test_all_performance.append(ensemble_test_all_performance_results)
In [504]:
##################################
# Consolidating all model performance measures
# for the test data
##################################
ensemble_test_all_performance = pd.concat(ensemble_test_all_performance, ignore_index=True)
print('Consolidated Ensemble Model Performance on Test Data: ')
display(ensemble_test_all_performance)
Consolidated Ensemble Model Performance on Test Data: 
metric_name metric_value model set
0 Accuracy 0.901099 bagged_rf_optimal test
1 Precision 0.821429 bagged_rf_optimal test
2 Recall 0.851852 bagged_rf_optimal test
3 F1 0.836364 bagged_rf_optimal test
4 AUROC 0.886863 bagged_rf_optimal test
... ... ... ... ...
105 Accuracy 0.912088 blended_metalearner_lr_optimal test
106 Precision 0.827586 blended_metalearner_lr_optimal test
107 Recall 0.888889 blended_metalearner_lr_optimal test
108 F1 0.857143 blended_metalearner_lr_optimal test
109 AUROC 0.905382 blended_metalearner_lr_optimal test

110 rows × 4 columns

In [505]:
##################################
# Consolidating all the final
# bagged, boosted, stacked and blended
# model performance measures
# for the test data
##################################
ensemble_test_performance = ensemble_test_all_performance[
    ~ensemble_test_all_performance['model'].str.contains('baselearner', case=False, na=False)
]
print('Consolidated Final Ensemble Model Performance on Test Data: ')
display(ensemble_test_performance)
Consolidated Final Ensemble Model Performance on Test Data: 
metric_name metric_value model set
0 Accuracy 0.901099 bagged_rf_optimal test
1 Precision 0.821429 bagged_rf_optimal test
2 Recall 0.851852 bagged_rf_optimal test
3 F1 0.836364 bagged_rf_optimal test
4 AUROC 0.886863 bagged_rf_optimal test
5 Accuracy 0.912088 bagged_et_optimal test
6 Precision 0.851852 bagged_et_optimal test
7 Recall 0.851852 bagged_et_optimal test
8 F1 0.851852 bagged_et_optimal test
9 AUROC 0.894676 bagged_et_optimal test
10 Accuracy 0.912088 bagged_bdt_optimal test
11 Precision 0.851852 bagged_bdt_optimal test
12 Recall 0.851852 bagged_bdt_optimal test
13 F1 0.851852 bagged_bdt_optimal test
14 AUROC 0.894676 bagged_bdt_optimal test
15 Accuracy 0.901099 bagged_blr_optimal test
16 Precision 0.800000 bagged_blr_optimal test
17 Recall 0.888889 bagged_blr_optimal test
18 F1 0.842105 bagged_blr_optimal test
19 AUROC 0.897569 bagged_blr_optimal test
20 Accuracy 0.912088 bagged_bsvm_optimal test
21 Precision 0.827586 bagged_bsvm_optimal test
22 Recall 0.888889 bagged_bsvm_optimal test
23 F1 0.857143 bagged_bsvm_optimal test
24 AUROC 0.905382 bagged_bsvm_optimal test
25 Accuracy 0.912088 boosted_ab_optimal test
26 Precision 0.851852 boosted_ab_optimal test
27 Recall 0.851852 boosted_ab_optimal test
28 F1 0.851852 boosted_ab_optimal test
29 AUROC 0.894676 boosted_ab_optimal test
30 Accuracy 0.923077 boosted_gb_optimal test
31 Precision 0.884615 boosted_gb_optimal test
32 Recall 0.851852 boosted_gb_optimal test
33 F1 0.867925 boosted_gb_optimal test
34 AUROC 0.902488 boosted_gb_optimal test
35 Accuracy 0.901099 boosted_xgb_optimal test
36 Precision 0.846154 boosted_xgb_optimal test
37 Recall 0.814815 boosted_xgb_optimal test
38 F1 0.830189 boosted_xgb_optimal test
39 AUROC 0.876157 boosted_xgb_optimal test
40 Accuracy 0.912088 boosted_lgbm_optimal test
41 Precision 0.880000 boosted_lgbm_optimal test
42 Recall 0.814815 boosted_lgbm_optimal test
43 F1 0.846154 boosted_lgbm_optimal test
44 AUROC 0.883970 boosted_lgbm_optimal test
45 Accuracy 0.912088 boosted_cb_optimal test
46 Precision 0.851852 boosted_cb_optimal test
47 Recall 0.851852 boosted_cb_optimal test
48 F1 0.851852 boosted_cb_optimal test
49 AUROC 0.894676 boosted_cb_optimal test
75 Accuracy 0.923077 stacked_metalearner_lr_optimal test
76 Precision 0.857143 stacked_metalearner_lr_optimal test
77 Recall 0.888889 stacked_metalearner_lr_optimal test
78 F1 0.872727 stacked_metalearner_lr_optimal test
79 AUROC 0.913194 stacked_metalearner_lr_optimal test
105 Accuracy 0.912088 blended_metalearner_lr_optimal test
106 Precision 0.827586 blended_metalearner_lr_optimal test
107 Recall 0.888889 blended_metalearner_lr_optimal test
108 F1 0.857143 blended_metalearner_lr_optimal test
109 AUROC 0.905382 blended_metalearner_lr_optimal test
In [506]:
##################################
# Gathering all model performance measures
# for the test data
##################################
ensemble_test_performance_Accuracy_test = ensemble_test_performance[(ensemble_test_performance['set']=='test') & (ensemble_test_performance['metric_name']=='Accuracy')].loc[:,"metric_value"]
ensemble_test_performance_Precision_test = ensemble_test_performance[(ensemble_test_performance['set']=='test') & (ensemble_test_performance['metric_name']=='Precision')].loc[:,"metric_value"]
ensemble_test_performance_Recall_test = ensemble_test_performance[(ensemble_test_performance['set']=='test') & (ensemble_test_performance['metric_name']=='Recall')].loc[:,"metric_value"]
ensemble_test_performance_F1_test = ensemble_test_performance[(ensemble_test_performance['set']=='test') & (ensemble_test_performance['metric_name']=='F1')].loc[:,"metric_value"]
ensemble_test_performance_AUROC_test = ensemble_test_performance[(ensemble_test_performance['set']=='test') & (ensemble_test_performance['metric_name']=='AUROC')].loc[:,"metric_value"]
In [507]:
##################################
# Combining all the model performance measures
# for the test data
##################################
ensemble_test_performance_all_plot_test = pd.DataFrame({'accuracy': ensemble_test_performance_Accuracy_test.values,
                                                                    'precision': ensemble_test_performance_Precision_test.values,
                                                                    'recall': ensemble_test_performance_Recall_test.values,
                                                                    'f1': ensemble_test_performance_F1_test.values,
                                                                    'auroc': ensemble_test_performance_AUROC_test.values},
                                                                   index=ensemble_test_performance['model'].unique())
ensemble_test_performance_all_plot_test
Out[507]:
accuracy precision recall f1 auroc
bagged_rf_optimal 0.901099 0.821429 0.851852 0.836364 0.886863
bagged_et_optimal 0.912088 0.851852 0.851852 0.851852 0.894676
bagged_bdt_optimal 0.912088 0.851852 0.851852 0.851852 0.894676
bagged_blr_optimal 0.901099 0.800000 0.888889 0.842105 0.897569
bagged_bsvm_optimal 0.912088 0.827586 0.888889 0.857143 0.905382
boosted_ab_optimal 0.912088 0.851852 0.851852 0.851852 0.894676
boosted_gb_optimal 0.923077 0.884615 0.851852 0.867925 0.902488
boosted_xgb_optimal 0.901099 0.846154 0.814815 0.830189 0.876157
boosted_lgbm_optimal 0.912088 0.880000 0.814815 0.846154 0.883970
boosted_cb_optimal 0.912088 0.851852 0.851852 0.851852 0.894676
stacked_metalearner_lr_optimal 0.923077 0.857143 0.888889 0.872727 0.913194
blended_metalearner_lr_optimal 0.912088 0.827586 0.888889 0.857143 0.905382
In [508]:
##################################
# Consolidating all the final
# bagged, boosted, stacked and blended
# model performance measures
# for the train, validation and test data
##################################
ensemble_overall_performance = pd.concat([ensemble_train_validation_performance, ensemble_test_performance], axis=0)
In [509]:
##################################
# Consolidating all the F1 score
# model performance measures
# between the train, validation and test data
##################################
ensemble_overall_performance_F1 = ensemble_overall_performance[ensemble_overall_performance['metric_name']=='F1']
ensemble_overall_performance_F1_train = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='train'].loc[:,"metric_value"]
ensemble_overall_performance_F1_validation = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='validation'].loc[:,"metric_value"]
ensemble_overall_performance_F1_test = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='test'].loc[:,"metric_value"]
In [510]:
##################################
# Combining all the F1 score
# model performance measures
# between the train, validation and test data
##################################
ensemble_overall_performance_F1_plot = pd.DataFrame({'train': ensemble_overall_performance_F1_train.values,
                                                     'validation': ensemble_overall_performance_F1_validation.values,
                                                     'test': ensemble_overall_performance_F1_test.values},
                                                    index=ensemble_overall_performance_F1['model'].unique())
ensemble_overall_performance_F1_plot
Out[510]:
train validation test
bagged_rf_optimal 0.833333 0.837209 0.836364
bagged_et_optimal 0.833333 0.837209 0.851852
bagged_bdt_optimal 0.846154 0.857143 0.851852
bagged_blr_optimal 0.833333 0.837209 0.842105
bagged_bsvm_optimal 0.852713 0.857143 0.857143
boosted_ab_optimal 0.843750 0.857143 0.851852
boosted_gb_optimal 0.910569 0.829268 0.867925
boosted_xgb_optimal 0.850394 0.857143 0.830189
boosted_lgbm_optimal 0.894309 0.820513 0.846154
boosted_cb_optimal 0.843750 0.857143 0.851852
stacked_metalearner_lr_optimal 0.852713 0.857143 0.872727
blended_metalearner_lr_optimal 0.854962 0.883721 0.857143
In [511]:
##################################
# Plotting all the F1 score
# model performance measures
# between train, validation and test sets
##################################
ax = ensemble_overall_performance_F1_plot.plot.barh(figsize=(10, 10), width=0.9)
ax.set_xlim(0.00, 1.00)
ax.set_title("Model Comparison by F1 Score Performance on Train, Validation and Test Data")
ax.set_xlabel("F1 Score Performance")
ax.set_ylabel("Ensemble Model")
ax.grid(False)
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in ax.containers:
    ax.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
[Figure: Model Comparison by F1 Score Performance on Train, Validation and Test Data]
In [512]:
##################################
# Computing the permutation importance
# for the final model determined as the blended model
# with a Logistic Regression meta learner comprised of the 
# KNN, SVM, Ridge Classifier, Neural Network and Decision Tree base learners
##################################
base_learner_names = ['KNN', 'SVM', 'Ridge Classifier', 'Neural Network', 'Decision Tree']
perm_importance = permutation_importance(
    blended_metalearner_lr_optimal,  # Meta Learner
    meta_validation_blended,         # Meta Features (Base Learner Predictions)
    y_preprocessed_validation_encoded,       # True Labels
    n_repeats=10, 
    random_state=42
)

# Obtaining the sorted indices in descending order
sorted_idx = perm_importance.importances_mean.argsort()[::-1]

# Plotting the feature importance
plt.figure(figsize=(17, 5))
plt.bar(range(len(perm_importance.importances_mean)), perm_importance.importances_mean[sorted_idx], align='center')
plt.xticks(range(len(perm_importance.importances_mean)), np.array(base_learner_names)[sorted_idx], rotation=90)
plt.xlabel("Base Learner")
plt.ylabel("Permutation Importance Score")
plt.title("Permutation Importance: Blended Model (Meta Learner: Logistic Regression, Base Learners: KNN, SVM, Ridge Classifier, Neural Network, Decision Tree)")
plt.show()
[Figure: Permutation Importance of the KNN, SVM, Ridge Classifier, Neural Network and Decision Tree Base Learners in the Blended Model with a Logistic Regression Meta Learner]
In [513]:
##################################
# Creating a function to compute the permutation importance
# for the KNN, SVM, Ridge Classifier, Neural Network and Decision Tree base learners
##################################
feature_names = ['Gender','Smoking','Physical_Examination','Adenopathy','Focality','Risk','T','Stage','Response','Age']
def compute_permutation_importance(model, X_evaluation, y_evaluation, model_name="Model", feature_names=feature_names, n_repeats=10, random_state=42):
    # Computing permutation importance
    perm_importance = permutation_importance(model, X_evaluation, y_evaluation, n_repeats=n_repeats, random_state=random_state)

    # Getting the sorted indices (descending order)
    sorted_idx = perm_importance.importances_mean.argsort()[::-1]

    # Using feature names if provided, else using column indices
    if feature_names is None:
        feature_names = [f"Feature {i}" for i in range(X_evaluation.shape[1])]

    # Plotting feature importance
    plt.figure(figsize=(17, 5))
    plt.bar(range(len(perm_importance.importances_mean)), perm_importance.importances_mean[sorted_idx], align='center')
    plt.xticks(range(len(perm_importance.importances_mean)), np.array(feature_names)[sorted_idx], rotation=90)
    plt.xlabel("Feature")
    plt.ylabel("Permutation Importance Score")
    plt.title(f"Feature Importance (Permutation): {model_name}")
    plt.show()

    return perm_importance
In [514]:
##################################
# Computing the permutation importance
# for the Ridge Classifier base learner
##################################
perm_importance_blended_baselearner_rc_optimal = compute_permutation_importance(blended_baselearner_rc_optimal, 
                                                                                X_preprocessed_train, 
                                                                                y_preprocessed_train_encoded, 
                                                                                "Optimal Blended Base Learner Ridge Classifier",
                                                                                feature_names=feature_names)
[Figure: Feature Importance (Permutation): Optimal Blended Base Learner Ridge Classifier]
In [515]:
##################################
# Computing the permutation importance
# for the Support Vector Machine base learner
##################################
perm_importance_blended_baselearner_svm_optimal = compute_permutation_importance(blended_baselearner_svm_optimal, 
                                                                                 X_preprocessed_train, 
                                                                                 y_preprocessed_train_encoded, 
                                                                                 "Optimal Blended Base Learner SVM",
                                                                                 feature_names=feature_names)
[Figure: Feature Importance (Permutation): Optimal Blended Base Learner SVM]
In [516]:
##################################
# Computing the permutation importance
# for the Decision Tree base learner
##################################
perm_importance_blended_baselearner_dt_optimal = compute_permutation_importance(blended_baselearner_dt_optimal, 
                                                                                X_preprocessed_train, 
                                                                                y_preprocessed_train_encoded, 
                                                                                "Optimal Blended Base Learner Decision Tree",
                                                                                feature_names=feature_names)
[Figure: Feature Importance (Permutation): Optimal Blended Base Learner Decision Tree]
In [517]:
##################################
# Computing the permutation importance
# for the KNN base learner
##################################
perm_importance_blended_baselearner_knn_optimal = compute_permutation_importance(blended_baselearner_knn_optimal, 
                                                                                 X_preprocessed_train, 
                                                                                 y_preprocessed_train_encoded, 
                                                                                 "Optimal Blended Base Learner KNN",
                                                                                 feature_names=feature_names)
[Figure: Feature Importance (Permutation): Optimal Blended Base Learner KNN]
In [518]:
##################################
# Computing the permutation importance
# for the Neural Network base learner
##################################
perm_importance_blended_baselearner_nn_optimal = compute_permutation_importance(blended_baselearner_nn_optimal, 
                                                                                X_preprocessed_train, 
                                                                                y_preprocessed_train_encoded, 
                                                                                "Optimal Blended Base Learner Neural Network",
                                                                                feature_names=feature_names)
[Figure: Feature Importance (Permutation): Optimal Blended Base Learner Neural Network]

2. Summary ¶

[Figure: Project59_Summary.png (consolidated project summary)]

3. References ¶

  • [Book] Ensemble Methods for Machine Learning by Gautam Kunapuli
  • [Book] Applied Predictive Modeling by Max Kuhn and Kjell Johnson
  • [Book] An Introduction to Statistical Learning by Gareth James, Daniela Witten, Trevor Hastie and Rob Tibshirani
  • [Book] Ensemble Methods: Foundations and Algorithms by Zhi-Hua Zhou
  • [Book] Effective XGBoost: Optimizing, Tuning, Understanding, and Deploying Classification Models (Treading on Python) by Matt Harrison, Edward Krueger, Alex Rook, Ronald Legere and Bojan Tunguz
  • [Python Library API] NumPy by NumPy Team
  • [Python Library API] pandas by Pandas Team
  • [Python Library API] seaborn by Seaborn Team
  • [Python Library API] matplotlib.pyplot by MatPlotLib Team
  • [Python Library API] matplotlib.image by MatPlotLib Team
  • [Python Library API] matplotlib.offsetbox by MatPlotLib Team
  • [Python Library API] itertools by Python Team
  • [Python Library API] operator by Python Team
  • [Python Library API] sklearn.experimental by Scikit-Learn Team
  • [Python Library API] sklearn.impute by Scikit-Learn Team
  • [Python Library API] sklearn.linear_model by Scikit-Learn Team
  • [Python Library API] sklearn.preprocessing by Scikit-Learn Team
  • [Python Library API] scipy by SciPy Team
  • [Python Library API] sklearn.tree by Scikit-Learn Team
  • [Python Library API] sklearn.ensemble by Scikit-Learn Team
  • [Python Library API] sklearn.svm by Scikit-Learn Team
  • [Python Library API] sklearn.metrics by Scikit-Learn Team
  • [Python Library API] sklearn.neighbors by Scikit-Learn Team
  • [Python Library API] sklearn.neural_network by Scikit-Learn Team
  • [Python Library API] xgboost by XGBoost Team
  • [Python Library API] lightgbm by LightGBM Team
  • [Python Library API] catboost by CatBoost Team
  • [Python Library API] imblearn.over_sampling by Imbalanced-Learn Team
  • [Python Library API] imblearn.under_sampling by Imbalanced-Learn Team
  • [Python Library API] StatsModels by StatsModels Team
  • [Python Library API] SciPy by SciPy Team
  • [Article] Ensemble: Boosting, Bagging, and Stacking Machine Learning by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Stacking Machine Learning: Everything You Need to Know by Ada Parker (MachineLearningPro.Org)
  • [Article] Ensemble Learning: Bagging, Boosting and Stacking by Edouard Duchesnay, Tommy Lofstedt and Feki Younes (Duchesnay.GitHub.IO)
  • [Article] Stack Machine Learning Models: Get Better Results by Casper Hansen (Developer.IBM.Com)
  • [Article] GradientBoosting vs AdaBoost vs XGBoost vs CatBoost vs LightGBM by Geeks for Geeks Team (GeeksForGeeks.Org)
  • [Article] A Gentle Introduction to the Gradient Boosting Algorithm for Machine Learning by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] The Ultimate Guide to AdaBoost Algorithm | What is AdaBoost Algorithm? by Ashish Kumar (MyGreatLearning.Com)
  • [Article] A Gentle Introduction to Ensemble Learning Algorithms by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Ensemble Methods: Elegant Techniques to Produce Improved Machine Learning Results by Necati Demir (Toptal.Com)
  • [Article] The Essential Guide to Ensemble Learning by Rohit Kundu (V7Labs.Com)
  • [Article] Develop an Intuition for How Ensemble Learning Works by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Mastering Ensemble Techniques in Machine Learning: Bagging, Boosting, Bayes Optimal Classifier, and Stacking by Rahul Jain (Medium)
  • [Article] Ensemble Learning: Bagging, Boosting, Stacking by Ayşe Kübra Kuyucu (Medium)
  • [Article] Ensemble: Boosting, Bagging, and Stacking Machine Learning by Aleyna Şenozan (Medium)
  • [Article] Boosting, Stacking, and Bagging for Ensemble Models for Time Series Analysis with Python by Kyle Jones (Medium)
  • [Article] Different types of Ensemble Techniques — Bagging, Boosting, Stacking, Voting, Blending by Abhishek Jain (Medium)
  • [Article] Understanding Ensemble Methods: Bagging, Boosting, and Stacking by Divya bhagat (Medium)
  • [Video Tutorial] BAGGING vs. BOOSTING vs STACKING in Ensemble Learning | Machine Learning by Gate Smashers (YouTube)
  • [Video Tutorial] What is Ensemble Method in Machine Learning | Bagging | Boosting | Stacking | Voting by Data_SPILL (YouTube)
  • [Video Tutorial] Ensemble Methods | Bagging | Boosting | Stacking by World of Signet (YouTube)
  • [Video Tutorial] Ensemble (Boosting, Bagging, and Stacking) in Machine Learning: Easy Explanation for Data Scientists by Emma Ding (YouTube)
  • [Video Tutorial] Ensemble Learning - Bagging, Boosting, and Stacking explained in 4 minutes! by Melissa Van Bussel (YouTube)
  • [Video Tutorial] Introduction to Ensemble Learning | Bagging , Boosting & Stacking Techniques by UncomplicatingTech (YouTube)
  • [Video Tutorial] Machine Learning Basics: Ensemble Learning: Bagging, Boosting, Stacking by ISSAI_NU (YouTube)
  • [Course] DataCamp Python Data Analyst Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Python Associate Data Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Python Data Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Machine Learning Engineer Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Machine Learning Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] IBM Data Analyst Professional Certificate by IBM Team (Coursera)
  • [Course] IBM Data Science Professional Certificate by IBM Team (Coursera)
  • [Course] IBM Machine Learning Professional Certificate by IBM Team (Coursera)
In [519]:
from IPython.display import display, HTML
display(HTML("<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"))