##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import itertools
import joblib
%matplotlib inline

from operator import add,mul,truediv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler
from scipy import stats
from scipy.stats import pointbiserialr

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

##################################
# Defining file paths
# (assembled with os.path.join so that
# the same relative locations resolve
# with the separator of the running platform
# instead of a hard-coded backslash)
##################################
DATASETS_ORIGINAL_PATH = os.path.join("datasets", "original")
DATASETS_PREPROCESSED_PATH = os.path.join("datasets", "preprocessed")
DATASETS_FINAL_PATH = os.path.join("datasets", "final", "complete")
DATASETS_FINAL_TRAIN_PATH = os.path.join("datasets", "final", "train")
DATASETS_FINAL_TRAIN_FEATURES_PATH = os.path.join(DATASETS_FINAL_TRAIN_PATH, "features")
DATASETS_FINAL_TRAIN_TARGET_PATH = os.path.join(DATASETS_FINAL_TRAIN_PATH, "target")
DATASETS_FINAL_VALIDATION_PATH = os.path.join("datasets", "final", "validation")
DATASETS_FINAL_VALIDATION_FEATURES_PATH = os.path.join(DATASETS_FINAL_VALIDATION_PATH, "features")
DATASETS_FINAL_VALIDATION_TARGET_PATH = os.path.join(DATASETS_FINAL_VALIDATION_PATH, "target")
DATASETS_FINAL_TEST_PATH = os.path.join("datasets", "final", "test")
DATASETS_FINAL_TEST_FEATURES_PATH = os.path.join(DATASETS_FINAL_TEST_PATH, "features")
DATASETS_FINAL_TEST_TARGET_PATH = os.path.join(DATASETS_FINAL_TEST_PATH, "target")
MODELS_PATH = "models"

##################################
# Reading the raw lung cancer data
# stored under DATASETS_ORIGINAL_PATH
# (resolved relative to the parent
# of the working directory)
##################################
lung_cancer = pd.read_csv(
    os.path.join("..", DATASETS_ORIGINAL_PATH, "lung_cancer.csv")
)

##################################
# Reporting the dataset dimensions
# as a (row count, column count) tuple
##################################
print('Dataset Dimensions: ')
display(lung_cancer.shape)

Dataset Dimensions:

(309, 16)

##################################
# Verifying the column names
# as loaded from the file
# (before any renaming is applied)
##################################
print('Column Names: ')
display(lung_cancer.columns)

Column Names:

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

##################################
# Standardizing the column names:
# surrounding white spaces are removed
# and inner spaces become underscores
# (e.g. 'FATIGUE ' -> 'FATIGUE',
# 'CHEST PAIN' -> 'CHEST_PAIN').
# The names are derived from the loaded header
# instead of being overwritten positionally
# with a fixed list, so every label stays
# attached to its own column even if the
# column order of the file changes.
##################################
lung_cancer.columns = [x.strip().replace(' ', '_') for x in lung_cancer.columns]

##################################
# Verifying the corrected column names
##################################
print('Column Names: ')
display(lung_cancer.columns)

Column Names:

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
      dtype='object')

##################################
# Listing the column names and data types
# as inferred when the file was read
##################################
print('Column Names and Data Types:')
display(lung_cancer.dtypes)

Column Names and Data Types:

GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC_DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL_CONSUMING         int64
COUGHING                  int64
SHORTNESS_OF_BREATH       int64
SWALLOWING_DIFFICULTY     int64
CHEST_PAIN                int64
LUNG_CANCER              object
dtype: object

##################################
# Previewing the first rows of the dataset
##################################
lung_cancer.head()

##################################
# Encoding the dichotomous variables:
# - GENDER and LUNG_CANCER become ordered categories
#   with the levels F < M and NO < YES
# - the indicator columns coded 1/2 are relabelled
#   as 'Absent'/'Present'
# Note: LUNG_CANCER is also part of int_columns,
# so the object cast below replaces its category dtype;
# the 1/2 relabelling targets integer codes only
##################################
lung_cancer['GENDER'] = pd.Categorical(lung_cancer['GENDER'], categories=['F', 'M'], ordered=True)
lung_cancer['LUNG_CANCER'] = pd.Categorical(lung_cancer['LUNG_CANCER'], categories=['NO', 'YES'], ordered=True)

# Every column except GENDER and AGE, in dataset order
int_columns = lung_cancer.columns.drop(['GENDER', 'AGE']).tolist()
lung_cancer[int_columns] = (
    lung_cancer[int_columns]
    .astype(object)
    .replace({1: 'Absent', 2: 'Present'})
)

##################################
# Re-listing the column names and data types
# after the encoding step
##################################
print('Column Names and Data Types:')
display(lung_cancer.dtypes)

Column Names and Data Types:

GENDER                   category
AGE                         int64
SMOKING                    object
YELLOW_FINGERS             object
ANXIETY                    object
PEER_PRESSURE              object
CHRONIC_DISEASE            object
FATIGUE                    object
ALLERGY                    object
WHEEZING                   object
ALCOHOL_CONSUMING          object
COUGHING                   object
SHORTNESS_OF_BREATH        object
SWALLOWING_DIFFICULTY      object
CHEST_PAIN                 object
LUNG_CANCER                object
dtype: object

##################################
# Previewing the first rows
# of the re-encoded dataset
##################################
lung_cancer.head()

##################################
# Summarizing the numeric variables
# (one row per variable)
##################################
print('Numeric Variable Summary:')
display(lung_cancer.describe(include='number').T)

Numeric Variable Summary:

##################################
# Summarizing the object and
# categorical variables
# (one row per variable)
##################################
print('Categorical Variable Summary:')
display(lung_cancer.describe(include=['category', 'object']).T)

Categorical Variable Summary:

##################################
# Counting the rows that repeat
# an earlier row in full
# (the first occurrence is not counted)
##################################
lung_cancer.duplicated(keep='first').sum()

np.int64(33)

##################################
# Showing the rows that repeat
# an earlier row in full
##################################
lung_cancer.loc[lung_cancer.duplicated(keep='first')]

##################################
# Gathering the data types for each column
##################################
data_type_list = list(lung_cancer.dtypes)

##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(lung_cancer.columns)

##################################
# Gathering the number of observations for each column
##################################
row_count_list = [len(lung_cancer)] * len(lung_cancer.columns)

##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(lung_cancer.isna().sum(axis=0))

##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(lung_cancer.count())

##################################
# Gathering the fill rate
# (non-missing count / row count) for each column.
# Materialized as a list because a bare map object
# is consumed by its first use, which would leave
# the summary empty if it were built a second time
##################################
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                              data_type_list,
                                              row_count_list,
                                              non_null_count_list,
                                              null_count_list,
                                              fill_rate_list), 
                                        columns=['Column.Name',
                                                 'Column.Type',
                                                 'Row.Count',
                                                 'Non.Null.Count',
                                                 'Null.Count',                                                 
                                                 'Fill.Rate'])
display(all_column_quality_summary)

##################################
# Counting the number of columns
# with Fill.Rate < 1.00
##################################
print('Number of Columns with Missing Data:', str(len(all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1)])))

Number of Columns with Missing Data: 0

##################################
# Identifying the columns
# with Fill.Rate < 1.00
##################################
column_low_fill_rate = all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1.00)]

##################################
# Gathering the metadata labels for each observation
##################################
row_metadata_list = lung_cancer.index.values.tolist()

##################################
# Gathering the number of columns for each observation
##################################
column_count_list = [len(lung_cancer.columns)] * len(lung_cancer)

##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(lung_cancer.isna().sum(axis=1))

##################################
# Gathering the missing data rate
# (missing count / column count) for each row.
# Materialized as a list because a bare map object
# is consumed by its first use, which would leave
# the summary empty if it were built a second time
##################################
missing_rate_list = list(map(truediv, null_row_list, column_count_list))

##################################
# Exploring the rows
# for missing data
##################################
all_row_quality_summary = pd.DataFrame(zip(row_metadata_list,
                                           column_count_list,
                                           null_row_list,
                                           missing_rate_list), 
                                        columns=['Row.Name',
                                                 'Column.Count',
                                                 'Null.Count',                                                 
                                                 'Missing.Rate'])
display(all_row_quality_summary)

##################################
# Counting the number of rows
# with Missing.Rate > 0.00
##################################
print('Number of Rows with Missing Data:',str(len(all_row_quality_summary[all_row_quality_summary['Missing.Rate']>0])))

Number of Rows with Missing Data: 0

##################################
# Formulating the dataset
# with numeric columns only
##################################
lung_cancer_numeric = lung_cancer.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = lung_cancer_numeric.columns

##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = lung_cancer_numeric.min()

##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = lung_cancer_numeric.mean()

##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = lung_cancer_numeric.median()

##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = lung_cancer_numeric.max()

##################################
# Gathering the value frequencies for each numeric column
# (computed once per column and reused for the
# mode values and the mode counts below;
# value_counts lists the values by descending frequency)
##################################
numeric_value_counts_list = [lung_cancer_numeric[x].value_counts(dropna=True) for x in lung_cancer_numeric]

##################################
# Gathering the first mode values for each numeric column
##################################
numeric_first_mode_list = [value_counts.index.tolist()[0] for value_counts in numeric_value_counts_list]

##################################
# Gathering the second mode values for each numeric column
# (NaN when a column holds a single distinct value,
# instead of failing with an IndexError)
##################################
numeric_second_mode_list = [value_counts.index.tolist()[1] if len(value_counts) > 1 else np.nan
                            for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [value_counts.iloc[0] for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of second mode values for each numeric column
# (0 when a column holds a single distinct value)
##################################
numeric_second_mode_count_list = [value_counts.iloc[1] if len(value_counts) > 1 else 0
                                  for value_counts in numeric_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each numeric column.
# Materialized as a list because a bare map object
# is consumed by its first use
##################################
numeric_first_second_mode_ratio_list = list(map(truediv, numeric_first_mode_count_list, numeric_second_mode_count_list))

##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = lung_cancer_numeric.nunique(dropna=True)

##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = [len(lung_cancer_numeric)] * len(lung_cancer_numeric.columns)

##################################
# Gathering the unique to count ratio for each numeric column
# (materialized as a list for the same reason as above)
##################################
numeric_unique_count_ratio_list = list(map(truediv, numeric_unique_count_list, numeric_row_count_list))

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = lung_cancer_numeric.skew()

##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = lung_cancer_numeric.kurtosis()

##################################
# Formulating the quality summary
# for all numeric columns
##################################
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                numeric_minimum_list,
                                                numeric_mean_list,
                                                numeric_median_list,
                                                numeric_maximum_list,
                                                numeric_first_mode_list,
                                                numeric_second_mode_list,
                                                numeric_first_mode_count_list,
                                                numeric_second_mode_count_list,
                                                numeric_first_second_mode_ratio_list,
                                                numeric_unique_count_list,
                                                numeric_row_count_list,
                                                numeric_unique_count_ratio_list,
                                                numeric_skewness_list,
                                                numeric_kurtosis_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Minimum',
                                                 'Mean',
                                                 'Median',
                                                 'Maximum',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio',
                                                 'Skewness',
                                                 'Kurtosis'])
display(numeric_column_quality_summary)

##################################
# Counting the numeric columns whose
# First.Second.Mode.Ratio exceeds 5.00
##################################
int(numeric_column_quality_summary['First.Second.Mode.Ratio'].gt(5).sum())

0

##################################
# Counting the numeric columns whose
# Unique.Count.Ratio exceeds 10.00
# NOTE(review): Unique.Count.Ratio is computed as
# Unique.Count / Row.Count, which cannot exceed 1,
# so this threshold can never be met - confirm the intended cut-off
##################################
int(numeric_column_quality_summary['Unique.Count.Ratio'].gt(10).sum())

0

##################################
# Counting the numeric columns whose
# Skewness lies outside the [-3.00, 3.00] range
##################################
int(numeric_column_quality_summary['Skewness'].abs().gt(3).sum())

0

##################################
# Formulating the dataset
# with object or categorical column only
##################################
lung_cancer_object = lung_cancer.select_dtypes(include=['object','category'])

##################################
# Gathering the variable names for the object or categorical column
##################################
categorical_variable_name_list = lung_cancer_object.columns

##################################
# Gathering the value frequencies for each object or categorical column
# (computed once per column and reused for the
# mode values and the mode counts below;
# value_counts lists the values by descending frequency)
##################################
categorical_value_counts_list = [lung_cancer_object[x].value_counts(dropna=True) for x in lung_cancer_object]

##################################
# Gathering the first mode values for the object or categorical column
##################################
categorical_first_mode_list = [value_counts.index.tolist()[0] for value_counts in categorical_value_counts_list]

##################################
# Gathering the second mode values for each object or categorical column
# (NaN when a column holds a single distinct value,
# instead of failing with an IndexError)
##################################
categorical_second_mode_list = [value_counts.index.tolist()[1] if len(value_counts) > 1 else np.nan
                                for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of first mode values for each object or categorical column
##################################
categorical_first_mode_count_list = [value_counts.iloc[0] for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of second mode values for each object or categorical column
# (0 when a column holds a single distinct value)
##################################
categorical_second_mode_count_list = [value_counts.iloc[1] if len(value_counts) > 1 else 0
                                      for value_counts in categorical_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each object or categorical column.
# Materialized as a list because a bare map object
# is consumed by its first use
##################################
categorical_first_second_mode_ratio_list = list(map(truediv, categorical_first_mode_count_list, categorical_second_mode_count_list))

##################################
# Gathering the count of unique values for each object or categorical column
##################################
categorical_unique_count_list = lung_cancer_object.nunique(dropna=True)

##################################
# Gathering the number of observations for each object or categorical column
##################################
categorical_row_count_list = [len(lung_cancer_object)] * len(lung_cancer_object.columns)

##################################
# Gathering the unique to count ratio for each object or categorical column
# (materialized as a list for the same reason as above)
##################################
categorical_unique_count_ratio_list = list(map(truediv, categorical_unique_count_list, categorical_row_count_list))

##################################
# Formulating the quality summary
# for all object or categorical columns
##################################
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
                                                 categorical_first_mode_list,
                                                 categorical_second_mode_list,
                                                 categorical_first_mode_count_list,
                                                 categorical_second_mode_count_list,
                                                 categorical_first_second_mode_ratio_list,
                                                 categorical_unique_count_list,
                                                 categorical_row_count_list,
                                                 categorical_unique_count_ratio_list), 
                                        columns=['Categorical.Column.Name',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio'])
display(categorical_column_quality_summary)

##################################
# Counting the object or categorical columns whose
# First.Second.Mode.Ratio exceeds 5.00
##################################
int(categorical_column_quality_summary['First.Second.Mode.Ratio'].gt(5).sum())

1

##################################
# Listing the object or categorical columns whose
# First.Second.Mode.Ratio exceeds 5.00,
# largest ratio first
##################################
display(
    categorical_column_quality_summary
    .loc[categorical_column_quality_summary['First.Second.Mode.Ratio'] > 5]
    .sort_values(by='First.Second.Mode.Ratio', ascending=False)
)

##################################
# Counting the object or categorical columns whose
# Unique.Count.Ratio exceeds 10.00
# NOTE(review): Unique.Count.Ratio is computed as
# Unique.Count / Row.Count, which cannot exceed 1,
# so this threshold can never be met - confirm the intended cut-off
##################################
int(categorical_column_quality_summary['Unique.Count.Ratio'].gt(10).sum())

0

##################################
# Formulating the dataset
# with numeric columns only
##################################
lung_cancer_numeric = lung_cancer.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = lung_cancer_numeric.columns

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = lung_cancer_numeric.skew()

##################################
# Computing the interquartile range
# for all columns
##################################
lung_cancer_numeric_q1 = lung_cancer_numeric.quantile(0.25)
lung_cancer_numeric_q3 = lung_cancer_numeric.quantile(0.75)
lung_cancer_numeric_iqr = lung_cancer_numeric_q3 - lung_cancer_numeric_q1

##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
# (values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR)
##################################
numeric_outlier_count_list = ((lung_cancer_numeric < (lung_cancer_numeric_q1 - 1.5 * lung_cancer_numeric_iqr)) | (lung_cancer_numeric > (lung_cancer_numeric_q3 + 1.5 * lung_cancer_numeric_iqr))).sum()

##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = [len(lung_cancer_numeric)] * len(lung_cancer_numeric.columns)

##################################
# Gathering the outlier to row count ratio for each numeric column.
# Materialized as a list because a bare map object
# is consumed by its first use, which would leave
# the summary empty if it were built a second time
##################################
numeric_outlier_ratio_list = list(map(truediv, numeric_outlier_count_list, numeric_row_count_list))

##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_skewness_list,
                                                  numeric_outlier_count_list,
                                                  numeric_row_count_list,
                                                  numeric_outlier_ratio_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Skewness',
                                                 'Outlier.Count',
                                                 'Row.Count',
                                                 'Outlier.Ratio'])
display(numeric_column_outlier_summary)

##################################
# Drawing one horizontal boxplot
# per numeric column,
# each on its own wide, short figure
##################################
for column in lung_cancer_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=lung_cancer_numeric)

##################################
# Building a numeric-only copy of the dataset
# for correlation analysis:
# - columns at positions 2 to 14 have
#   'Absent'/'Present' recoded to 0/1
# - GENDER and LUNG_CANCER are dropped
# - AGE is cast to float
# - remaining object columns are cast to integer
##################################
pd.set_option('future.no_silent_downcasting', True)
lung_cancer_correlation = lung_cancer.copy()
lung_cancer_correlation_object = lung_cancer_correlation.columns[2:15]
lung_cancer_correlation[lung_cancer_correlation_object] = (
    lung_cancer_correlation[lung_cancer_correlation_object]
    .replace({'Absent': 0, 'Present': 1})
)
lung_cancer_correlation = lung_cancer_correlation.drop(columns=['GENDER', 'LUNG_CANCER'])
lung_cancer_correlation['AGE'] = lung_cancer_correlation['AGE'].astype(float)

# With silent downcasting disabled above, the recoded columns
# are expected to still be object-typed, hence the explicit cast
object_cols_to_convert = lung_cancer_correlation.columns[1:]
for col in object_cols_to_convert:
    if lung_cancer_correlation[col].dtype != 'object':
        continue
    lung_cancer_correlation[col] = lung_cancer_correlation[col].astype(int)
display(lung_cancer_correlation)

##################################
# Initializing the correlation matrix
##################################
lung_cancer_correlation_matrix = pd.DataFrame(np.zeros((len(lung_cancer_correlation.columns), len(lung_cancer_correlation.columns))),
                                              columns=lung_cancer_correlation.columns,
                                              index=lung_cancer_correlation.columns)

##################################
# Flagging the continuous columns
# (float-typed columns are treated as continuous,
# all other columns as binary indicators).
# A dtype-kind check is used instead of comparing
# against the literal 'int64'/'float64' names so the
# dispatch does not depend on the platform integer width
##################################
lung_cancer_correlation_is_continuous = [pd.api.types.is_float_dtype(column_dtype)
                                         for column_dtype in lung_cancer_correlation.dtypes]

##################################
# Calculating different types
# of correlation coefficients
# per variable type
# (upper triangle only; each value
# is mirrored to the lower triangle)
##################################
for i in range(len(lung_cancer_correlation.columns)):
    for j in range(i, len(lung_cancer_correlation.columns)):
        if i == j:
            lung_cancer_correlation_matrix.iloc[i, j] = 1.0
            continue
        first_var = lung_cancer_correlation.iloc[:, i]
        second_var = lung_cancer_correlation.iloc[:, j]
        if lung_cancer_correlation_is_continuous[i] != lung_cancer_correlation_is_continuous[j]:
            # Point-biserial correlation for exactly one continuous
            # and one binary variable, whichever position each one is in
            if lung_cancer_correlation_is_continuous[i]:
                continuous_var, binary_var = first_var, second_var
            else:
                continuous_var, binary_var = second_var, first_var
            corr, _ = pointbiserialr(binary_var, continuous_var)
        else:
            # Same variable type on both sides: Series.corr covers
            # the Phi coefficient for two binary variables and the
            # Pearson correlation for two continuous variables
            # (previously a continuous pair was correlated with itself)
            corr = first_var.corr(second_var)
        lung_cancer_correlation_matrix.iloc[i, j] = corr
        lung_cancer_correlation_matrix.iloc[j, i] = corr

##################################
# Rendering the correlation matrix
# as an annotated heatmap
# on a fixed [-1, 1] color scale
##################################
plt.figure(figsize=(17, 8))
sns.heatmap(data=lung_cancer_correlation_matrix,
            vmin=-1,
            vmax=1,
            cmap='coolwarm',
            annot=True)
plt.show()

##################################
# Estimating the lung cancer prevalence
# as the relative frequency of each target level
##################################
print('Lung Cancer Prevalence: ')
display(lung_cancer['LUNG_CANCER'].value_counts(normalize=True))

Lung Cancer Prevalence:

LUNG_CANCER
YES    0.873786
NO     0.126214
Name: proportion, dtype: float64

##################################
# Separating the predictors from the target:
# every column except the last one is a predictor,
# AGE is the numeric predictor and
# the remaining predictors are categorical
##################################
lung_cancer_predictors = lung_cancer.columns[:-1]
lung_cancer_predictors_numeric = lung_cancer_predictors[lung_cancer_predictors == 'AGE']
lung_cancer_predictors_categorical = lung_cancer_predictors[lung_cancer_predictors != 'AGE']

##################################
# Selecting the variables for the boxplot:
# the target on the horizontal axis and
# the first numeric predictor on the vertical axis
##################################
boxplot_y_variable = 'LUNG_CANCER'
boxplot_x_variable = lung_cancer_predictors_numeric[0]

##################################
# Comparing the numeric predictor
# across the target levels
# (one box per target group)
##################################
plt.figure(figsize=(7, 5))
plt.boxplot([
    target_group[boxplot_x_variable]
    for _, target_group in lung_cancer.groupby(boxplot_y_variable, observed=True)
])
plt.title(f'{boxplot_y_variable} Versus {boxplot_x_variable}')
plt.xlabel(boxplot_y_variable)
plt.ylabel(boxplot_x_variable)
plt.xticks(range(1, len(lung_cancer[boxplot_y_variable].unique()) + 1),
           ['No', 'Yes'])
plt.show()

##################################
# Selecting the variables for the proportion plots:
# the target on the horizontal axis and
# each categorical predictor as the stacked segments
##################################
proportion_y_variables = lung_cancer_predictors_categorical
proportion_x_variable = 'LUNG_CANCER'

##################################
# Laying out a grid of subplots
# and flattening it so that the axes
# can be addressed with a single index
##################################
num_rows = 7
num_cols = 2
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 40))
axes = axes.ravel()

##################################
# Drawing one stacked column plot
# per categorical predictor, showing the
# within-target-level proportions of its categories
##################################
for i, y_variable in enumerate(proportion_y_variables):
    ax = axes[i]
    # Counts per (target level, predictor level) pair, one row per target level
    category_counts = (
        lung_cancer
        .groupby([proportion_x_variable, y_variable], observed=True)
        .size()
        .unstack(fill_value=0)
    )
    # Each row is divided by its own total, so every bar sums to 1
    category_proportions = category_counts.div(category_counts.sum(axis=1), axis='index')
    category_proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{proportion_x_variable} Versus {y_variable}')
    ax.set_xlabel(proportion_x_variable)
    ax.set_ylabel('PROPORTIONS')
    ax.legend(loc="lower center")

##################################
# Tightening the layout
# and presenting the subplots
##################################
plt.tight_layout()
plt.show()

##################################
# Computing the t-test
# statistic and p-values
# between the target variable
# and numeric predictor columns
##################################
lung_cancer_numeric_ttest_target = {}
lung_cancer_numeric = lung_cancer.loc[:,(lung_cancer.columns == 'AGE') | (lung_cancer.columns == 'LUNG_CANCER')]
lung_cancer_numeric_columns = lung_cancer_predictors_numeric

# The two target groups do not depend on the predictor being tested,
# so they are filtered once instead of on every loop iteration
group_0 = lung_cancer_numeric[lung_cancer_numeric.loc[:,'LUNG_CANCER']=='NO']
group_1 = lung_cancer_numeric[lung_cancer_numeric.loc[:,'LUNG_CANCER']=='YES']

for numeric_column in lung_cancer_numeric_columns:
    # Independent two-sample t-test with equal_var=True
    lung_cancer_numeric_ttest_target['LUNG_CANCER_' + numeric_column] = stats.ttest_ind(
        group_0[numeric_column],
        group_1[numeric_column],
        equal_var=True)

##################################
# Formulating the pairwise ttest summary
# between the target variable
# and numeric predictor columns
##################################
# from_dict is a DataFrame class-level constructor; it is called on
# pd.DataFrame directly rather than through an unrelated data frame
lung_cancer_numeric_summary = pd.DataFrame.from_dict(lung_cancer_numeric_ttest_target, orient='index')
lung_cancer_numeric_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(lung_cancer_numeric_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(len(lung_cancer_predictors_numeric)))

##################################
# Computing the chi-square
# statistic and p-values
# between the target variable
# and categorical predictor columns
##################################
lung_cancer_categorical_chisquare_target = {}

# Every column other than 'AGE' is kept, which includes the target column
lung_cancer_categorical = lung_cancer.loc[:, lung_cancer.columns != 'AGE']
lung_cancer_categorical_columns = lung_cancer_predictors_categorical

for categorical_column in lung_cancer_categorical_columns:
    # Cross-tabulating the predictor levels against the target classes
    contingency_table = pd.crosstab(
        lung_cancer_categorical[categorical_column],
        lung_cancer_categorical['LUNG_CANCER'],
    )
    # Only the first two results (test statistic and p-value) are stored
    chisquare_key = 'LUNG_CANCER_' + categorical_column
    chisquare_result = stats.chi2_contingency(contingency_table)
    lung_cancer_categorical_chisquare_target[chisquare_key] = chisquare_result[0:2]

##################################
# Formulating the pairwise chisquare summary
# between the target variable
# and categorical predictor columns
##################################
# from_dict is a DataFrame class-level constructor; it is called on
# pd.DataFrame directly rather than through an unrelated data frame
lung_cancer_categorical_summary = pd.DataFrame.from_dict(lung_cancer_categorical_chisquare_target, orient='index')
lung_cancer_categorical_summary.columns = ['ChiSquare.Test.Statistic', 'ChiSquare.Test.PValue']
display(lung_cancer_categorical_summary.sort_values(by=['ChiSquare.Test.PValue'], ascending=True).head(len(lung_cancer_predictors_categorical)))

##################################
# Creating a dataset copy and
# recoding all values as numbers
# prior to data splitting and modelling
##################################
pd.set_option('future.no_silent_downcasting', True)
lung_cancer_transformed = lung_cancer.copy()

# Columns at positions 2 to 14 hold the Absent/Present indicators
lung_cancer_transformed_object = lung_cancer_transformed.columns[2:15]

# GENDER and LUNG_CANCER are converted to categoricals
# whose category labels are renamed to 0/1
for recoded_column, recoded_categories in (
    ('GENDER', {'F': 0, 'M': 1}),
    ('LUNG_CANCER', {'NO': 0, 'YES': 1}),
):
    lung_cancer_transformed[recoded_column] = (
        lung_cancer_transformed[recoded_column]
        .astype('category')
        .cat.rename_categories(recoded_categories)
    )

# The indicator columns are recoded by value replacement
lung_cancer_transformed[lung_cancer_transformed_object] = (
    lung_cancer_transformed[lung_cancer_transformed_object]
    .replace({'Absent': 0, 'Present': 1})
)
display(lung_cancer_transformed)

##################################
# Saving the transformed data
# to the DATASETS_PREPROCESSED_PATH
##################################
lung_cancer_transformed.to_csv(
    os.path.join("..", DATASETS_PREPROCESSED_PATH, "lung_cancer_transformed.csv"),
    index=False,
)

##################################
# Filtering out predictors that did not exhibit
# sufficient discrimination of the target variable
# and saving the filtered data
# to the DATASETS_FINAL_PATH
##################################
lung_cancer_filtered = lung_cancer_transformed.drop(
    columns=['GENDER','CHRONIC_DISEASE', 'SHORTNESS_OF_BREATH', 'SMOKING', 'AGE'],
)
lung_cancer_filtered.to_csv(
    os.path.join("..", DATASETS_FINAL_PATH, "lung_cancer_final.csv"),
    index=False,
)
display(lung_cancer_filtered)

##################################
# Working from an independent copy
# of the filtered data
##################################
lung_cancer_final = lung_cancer_filtered.copy()

##################################
# Reporting the dimensions
# (rows, columns) of the final dataset
##################################
lung_cancer_final_dimensions = lung_cancer_final.shape
print('Final Dataset Dimensions: ')
display(lung_cancer_final_dimensions)

Final Dataset Dimensions:

(309, 11)

# Counting the rows per target class and expressing each count
# as a percentage of all rows in the final dataset
print('Target Variable Breakdown: ')
lung_cancer_breakdown = (
    lung_cancer_final
    .groupby('LUNG_CANCER', observed=True)
    .size()
    .reset_index(name='Count')
)
lung_cancer_breakdown['Percentage'] = lung_cancer_breakdown['Count'] / len(lung_cancer_final) * 100
display(lung_cancer_breakdown)

Target Variable Breakdown:

##################################
# Formulating the train and test data
# from the final dataset
# by applying stratification and
# using a 75-25 ratio
##################################
lung_cancer_train_initial, lung_cancer_test = train_test_split(
    lung_cancer_final,
    test_size=0.25,
    random_state=88888888,
    stratify=lung_cancer_final['LUNG_CANCER'],
)

##################################
# Separating predictors from the target
# and summarizing the dimensions and
# class balance of the initial training dataset
##################################
X_train_initial = lung_cancer_train_initial.drop(columns='LUNG_CANCER')
y_train_initial = lung_cancer_train_initial['LUNG_CANCER']
print('Initial Training Dataset Dimensions: ')
for initial_train_part in (X_train_initial, y_train_initial):
    display(initial_train_part.shape)
for initial_train_heading, initial_train_as_proportion in (
    ('Initial Training Target Variable Breakdown: ', False),
    ('Initial Training Target Variable Proportion: ', True),
):
    print(initial_train_heading)
    display(y_train_initial.value_counts(normalize=initial_train_as_proportion))

Initial Training Dataset Dimensions:

(231, 10)

(231,)

Initial Training Target Variable Breakdown:

LUNG_CANCER
1    202
0     29
Name: count, dtype: int64

Initial Training Target Variable Proportion:

LUNG_CANCER
1    0.874459
0    0.125541
Name: proportion, dtype: float64

##################################
# Separating predictors from the target
# and summarizing the dimensions and
# class balance of the test dataset
##################################
X_test = lung_cancer_test.drop(columns='LUNG_CANCER')
y_test = lung_cancer_test['LUNG_CANCER']
print('Test Dataset Dimensions: ')
for test_part in (X_test, y_test):
    display(test_part.shape)
for test_heading, test_as_proportion in (
    ('Test Target Variable Breakdown: ', False),
    ('Test Target Variable Proportion: ', True),
):
    print(test_heading)
    display(y_test.value_counts(normalize=test_as_proportion))

Test Dataset Dimensions:

(78, 10)

(78,)

Test Target Variable Breakdown:

LUNG_CANCER
1    68
0    10
Name: count, dtype: int64

Test Target Variable Proportion:

LUNG_CANCER
1    0.871795
0    0.128205
Name: proportion, dtype: float64

##################################
# Formulating the train and validation data
# from the initial train dataset
# by applying stratification and
# using a 75-25 ratio
##################################
lung_cancer_train, lung_cancer_validation = train_test_split(
    lung_cancer_train_initial,
    test_size=0.25,
    random_state=88888888,
    stratify=lung_cancer_train_initial['LUNG_CANCER'],
)

##################################
# Separating predictors from the target
# and summarizing the dimensions and
# class balance of the final training dataset
##################################
X_train = lung_cancer_train.drop(columns='LUNG_CANCER')
y_train = lung_cancer_train['LUNG_CANCER']
print('Final Training Dataset Dimensions: ')
for final_train_part in (X_train, y_train):
    display(final_train_part.shape)
for final_train_heading, final_train_as_proportion in (
    ('Final Training Target Variable Breakdown: ', False),
    ('Final Training Target Variable Proportion: ', True),
):
    print(final_train_heading)
    display(y_train.value_counts(normalize=final_train_as_proportion))

Final Training Dataset Dimensions:

(173, 10)

(173,)

Final Training Target Variable Breakdown:

LUNG_CANCER
1    151
0     22
Name: count, dtype: int64

Final Training Target Variable Proportion:

LUNG_CANCER
1    0.872832
0    0.127168
Name: proportion, dtype: float64

##################################
# Separating predictors from the target
# and summarizing the dimensions and
# class balance of the validation dataset
##################################
X_validation = lung_cancer_validation.drop(columns='LUNG_CANCER')
y_validation = lung_cancer_validation['LUNG_CANCER']
print('Validation Dataset Dimensions: ')
for validation_part in (X_validation, y_validation):
    display(validation_part.shape)
for validation_heading, validation_as_proportion in (
    ('Validation Target Variable Breakdown: ', False),
    ('Validation Target Variable Proportion: ', True),
):
    print(validation_heading)
    display(y_validation.value_counts(normalize=validation_as_proportion))

Validation Dataset Dimensions:

(58, 10)

(58,)

Validation Target Variable Breakdown:

LUNG_CANCER
1    51
0     7
Name: count, dtype: int64

Validation Target Variable Proportion:

LUNG_CANCER
1    0.87931
0    0.12069
Name: proportion, dtype: float64

##################################
# Oversampling the training data
# with the Synthetic Minority
# Oversampling Technique (SMOTE)
# and summarizing the resampled data
##################################
smote = SMOTE(random_state=88888888)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('SMOTE-Upsampled Training Dataset Dimensions: ')
for smote_part in (X_train_smote, y_train_smote):
    display(smote_part.shape)
for smote_heading, smote_as_proportion in (
    ('SMOTE-Upsampled Training Target Variable Breakdown: ', False),
    ('SMOTE-Upsampled Training Target Variable Proportion: ', True),
):
    print(smote_heading)
    display(y_train_smote.value_counts(normalize=smote_as_proportion))

SMOTE-Upsampled Training Dataset Dimensions:

(302, 10)

(302,)

SMOTE-Upsampled Training Target Variable Breakdown:

LUNG_CANCER
0    151
1    151
Name: count, dtype: int64

SMOTE-Upsampled Training Target Variable Proportion:

LUNG_CANCER
0    0.5
1    0.5
Name: proportion, dtype: float64

##################################
# Undersampling the training data
# with the Condensed Nearest Neighbour
# method and summarizing the resampled data
##################################
cnn = CondensedNearestNeighbour(n_neighbors=3, random_state=88888888)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train)
print('Downsampled Training Dataset Dimensions: ')
for cnn_part in (X_train_cnn, y_train_cnn):
    display(cnn_part.shape)
for cnn_heading, cnn_as_proportion in (
    ('Downsampled Training Target Variable Breakdown: ', False),
    ('Downsampled Training Target Variable Proportion: ', True),
):
    print(cnn_heading)
    display(y_train_cnn.value_counts(normalize=cnn_as_proportion))

Downsampled Training Dataset Dimensions:

(61, 10)

(61,)

Downsampled Training Target Variable Breakdown:

LUNG_CANCER
1    39
0    22
Name: count, dtype: int64

Downsampled Training Target Variable Proportion:

LUNG_CANCER
1    0.639344
0    0.360656
Name: proportion, dtype: float64

##################################
# Saving the training data
# (original, SMOTE-upsampled and CNN-downsampled)
# to the DATASETS_FINAL_TRAIN_PATH
# and DATASETS_FINAL_TRAIN_FEATURES_PATH
# and DATASETS_FINAL_TRAIN_TARGET_PATH
##################################
train_exports = (
    (lung_cancer_train, DATASETS_FINAL_TRAIN_PATH, "lung_cancer_train.csv"),
    (X_train, DATASETS_FINAL_TRAIN_FEATURES_PATH, "X_train.csv"),
    (y_train, DATASETS_FINAL_TRAIN_TARGET_PATH, "y_train.csv"),
    (X_train_smote, DATASETS_FINAL_TRAIN_FEATURES_PATH, "X_train_smote.csv"),
    (y_train_smote, DATASETS_FINAL_TRAIN_TARGET_PATH, "y_train_smote.csv"),
    (X_train_cnn, DATASETS_FINAL_TRAIN_FEATURES_PATH, "X_train_cnn.csv"),
    (y_train_cnn, DATASETS_FINAL_TRAIN_TARGET_PATH, "y_train_cnn.csv"),
)
for export_data, export_directory, export_filename in train_exports:
    export_data.to_csv(os.path.join("..", export_directory, export_filename), index=False)

##################################
# Saving the validation data
# to the DATASETS_FINAL_VALIDATION_PATH
# and DATASETS_FINAL_VALIDATION_FEATURES_PATH
# and DATASETS_FINAL_VALIDATION_TARGET_PATH
##################################
validation_exports = (
    (lung_cancer_validation, DATASETS_FINAL_VALIDATION_PATH, "lung_cancer_validation.csv"),
    (X_validation, DATASETS_FINAL_VALIDATION_FEATURES_PATH, "X_validation.csv"),
    (y_validation, DATASETS_FINAL_VALIDATION_TARGET_PATH, "y_validation.csv"),
)
for export_data, export_directory, export_filename in validation_exports:
    export_data.to_csv(os.path.join("..", export_directory, export_filename), index=False)

##################################
# Saving the test data
# to the DATASETS_FINAL_TEST_PATH
# and DATASETS_FINAL_TEST_FEATURES_PATH
# and DATASETS_FINAL_TEST_TARGET_PATH
##################################
test_exports = (
    (lung_cancer_test, DATASETS_FINAL_TEST_PATH, "lung_cancer_test.csv"),
    (X_test, DATASETS_FINAL_TEST_FEATURES_PATH, "X_test.csv"),
    (y_test, DATASETS_FINAL_TEST_TARGET_PATH, "y_test.csv"),
)
for export_data, export_directory, export_filename in test_exports:
    export_data.to_csv(os.path.join("..", export_directory, export_filename), index=False)

##################################
# Defining the modelling pipeline
# with a single logistic regression step
##################################
individual_pipeline = Pipeline(steps=[
    ('individual_model',
     LogisticRegression(max_iter=5000, random_state=88888888, solver='saga')),
])

##################################
# Grid search settings shared by both
# individual-model searches: 5-fold cross-validation
# with F1 score as the model evaluation metric
##################################
individual_grid_search_settings = dict(scoring='f1', cv=5, n_jobs=-1, verbose=1)

##################################
# Defining the hyperparameters for grid search
# including the regularization penalties
# and balanced class weights
# (the "unbalanced class" variant)
##################################
individual_unbalanced_class_hyperparameter_grid = {
    'individual_model__penalty': ['l1', 'l2', None],
    'individual_model__class_weight': ['balanced'],
}

##################################
# Setting up the GridSearchCV
# for the class-weighted variant
##################################
individual_unbalanced_class_grid_search = GridSearchCV(
    estimator=individual_pipeline,
    param_grid=individual_unbalanced_class_hyperparameter_grid,
    **individual_grid_search_settings,
)

##################################
# Defining the hyperparameters for grid search
# including the regularization penalties
# with no class weights
# (the "balanced class" variant)
##################################
individual_balanced_class_hyperparameter_grid = {
    'individual_model__penalty': ['l1', 'l2', None],
    'individual_model__class_weight': [None],
}

##################################
# Setting up the GridSearchCV
# for the unweighted variant
##################################
individual_balanced_class_grid_search = GridSearchCV(
    estimator=individual_pipeline,
    param_grid=individual_balanced_class_hyperparameter_grid,
    **individual_grid_search_settings,
)

##################################
# Defining the base learners
# for the stacked classifier:
# decision tree, random forest
# and linear support vector machine,
# all with balanced class weights
##################################
stacked_unbalanced_class_base_learners = [
    ('dt', DecisionTreeClassifier(criterion='entropy',
                                  min_samples_leaf=3,
                                  class_weight='balanced',
                                  random_state=88888888)),
    ('rf', RandomForestClassifier(criterion='entropy',
                                  max_features='sqrt',
                                  min_samples_leaf=3,
                                  class_weight='balanced',
                                  random_state=88888888)),
    ('svm', SVC(kernel='linear',
                probability=True,
                class_weight='balanced',
                random_state=88888888)),
]

##################################
# Defining the meta-learner
# as a logistic regression
##################################
stacked_unbalanced_class_meta_learner = LogisticRegression(
    max_iter=5000,
    random_state=88888888,
    solver='saga',
)

##################################
# Defining the stacking model that
# combines the base learners through
# the logistic regression meta-learner
##################################
stacked_unbalanced_class_model = StackingClassifier(
    estimators=stacked_unbalanced_class_base_learners,
    final_estimator=stacked_unbalanced_class_meta_learner,
)

##################################
# Defining the modelling pipeline
# with the stacked classifier
# as its only step
##################################
stacked_unbalanced_class_pipeline = Pipeline(steps=[
    ('stacked_model', stacked_unbalanced_class_model),
])

##################################
# Defining the hyperparameters for grid search:
# tree depths, SVM regularization strength,
# meta-learner penalties and
# balanced meta-learner class weights
##################################
stacked_unbalanced_class_hyperparameter_grid = dict(
    stacked_model__dt__max_depth=[3, 5],
    stacked_model__rf__max_depth=[3, 5],
    stacked_model__svm__C=[0.50, 1.00],
    stacked_model__final_estimator__penalty=['l1', 'l2', None],
    stacked_model__final_estimator__class_weight=['balanced'],
)

##################################
# Setting up the GridSearchCV with 5-fold cross-validation
# and using F1 score as the model evaluation metric
##################################
stacked_unbalanced_class_grid_search = GridSearchCV(
    estimator=stacked_unbalanced_class_pipeline,
    param_grid=stacked_unbalanced_class_hyperparameter_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

##################################
# Defining the base learners
# for the stacked classifier:
# decision tree, random forest
# and linear support vector machine,
# all without class weights
##################################
stacked_balanced_class_decision_tree = DecisionTreeClassifier(
    class_weight=None,
    criterion='entropy',
    min_samples_leaf=3,
    random_state=88888888,
)
stacked_balanced_class_random_forest = RandomForestClassifier(
    class_weight=None,
    criterion='entropy',
    max_features='sqrt',
    min_samples_leaf=3,
    random_state=88888888,
)
stacked_balanced_class_support_vector_machine = SVC(
    class_weight=None,
    probability=True,
    kernel='linear',
    random_state=88888888,
)
stacked_balanced_class_base_learners = [
    ('dt', stacked_balanced_class_decision_tree),
    ('rf', stacked_balanced_class_random_forest),
    ('svm', stacked_balanced_class_support_vector_machine),
]

##################################
# Defining the meta-learner
# as a logistic regression
##################################
stacked_balanced_class_meta_learner = LogisticRegression(
    solver='saga',
    random_state=88888888,
    max_iter=5000,
)

##################################
# Defining the stacking model that
# combines the base learners through
# the logistic regression meta-learner
##################################
stacked_balanced_class_model = StackingClassifier(
    estimators=stacked_balanced_class_base_learners,
    final_estimator=stacked_balanced_class_meta_learner,
)

##################################
# Defining the modelling pipeline
# with the stacked classifier
# as its only step
##################################
stacked_balanced_class_pipeline = Pipeline(steps=[
    ('stacked_model', stacked_balanced_class_model),
])

##################################
# Defining the hyperparameters for grid search:
# tree depths, SVM regularization strength,
# meta-learner penalties and
# no meta-learner class weights
##################################
stacked_balanced_class_hyperparameter_grid = {
    'stacked_model__dt__max_depth': [3, 5],
    'stacked_model__rf__max_depth': [3, 5],
    'stacked_model__svm__C': [0.50, 1.00],
    'stacked_model__final_estimator__penalty': ['l1', 'l2', None],
    'stacked_model__final_estimator__class_weight': [None],
}

##################################
# Setting up the GridSearchCV with 5-fold cross-validation
# and using F1 score as the model evaluation metric
##################################
stacked_balanced_class_grid_search = GridSearchCV(
    estimator=stacked_balanced_class_pipeline,
    param_grid=stacked_balanced_class_hyperparameter_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

##################################
# Running the class-weighted
# individual-model grid search
# on the original training data
##################################
individual_unbalanced_class_grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': ['balanced'],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': ['balanced'],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

Pipeline(steps=[('individual_model',
                 LogisticRegression(class_weight='balanced', max_iter=5000,
                                    random_state=88888888, solver='saga'))])

LogisticRegression(class_weight='balanced', max_iter=5000,
                   random_state=88888888, solver='saga')

##################################
# Retrieving the best estimator
# found by the grid search
##################################
individual_unbalanced_class_best_model_original = individual_unbalanced_class_grid_search.best_estimator_

##################################
# Evaluating the F1 scores:
# the grid search's best cross-validation score,
# plus scores on the training and validation data
##################################
individual_unbalanced_class_best_model_original_f1_cv = individual_unbalanced_class_grid_search.best_score_
individual_unbalanced_class_best_model_original_f1_train = f1_score(
    y_true=y_train,
    y_pred=individual_unbalanced_class_best_model_original.predict(X_train),
)
individual_unbalanced_class_best_model_original_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=individual_unbalanced_class_best_model_original.predict(X_validation),
)

##################################
# Reporting the hyperparameters
# of the optimal model
##################################
print('Best Individual Model using the Original Train Data: ')
print(f"Best Individual Model Parameters: {individual_unbalanced_class_grid_search.best_params_}")

Best Individual Model using the Original Train Data: 
Best Individual Model Parameters: {'individual_model__class_weight': 'balanced', 'individual_model__penalty': 'l2'}

##################################
# Summarizing the F1 score results
# on the training and cross-validated data
# to assess overfitting optimism
##################################
individual_training_classification_report = classification_report(
    y_train,
    individual_unbalanced_class_best_model_original.predict(X_train),
)
print(f"F1 Score on Cross-Validated Data: {individual_unbalanced_class_best_model_original_f1_cv:.4f}")
print(f"F1 Score on Training Data: {individual_unbalanced_class_best_model_original_f1_train:.4f}")
print("\nClassification Report on Training Data:\n", individual_training_classification_report)

F1 Score on Cross-Validated Data: 0.9116
F1 Score on Training Data: 0.9306

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.53      0.86      0.66        22
           1       0.98      0.89      0.93       151

    accuracy                           0.88       173
   macro avg       0.75      0.88      0.79       173
weighted avg       0.92      0.88      0.90       173

##################################
# Formulating the raw and normalized
# (per actual class) confusion matrices
# from the training data
##################################
individual_training_predictions = individual_unbalanced_class_best_model_original.predict(X_train)
cm_raw = confusion_matrix(y_train, individual_training_predictions)
cm_normalized = confusion_matrix(y_train, individual_training_predictions, normalize='true')

##################################
# Plotting both matrices side by side
# as annotated heatmaps
##################################
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
cm_panels = (
    (cm_raw, 'd', 'Confusion Matrix (Raw Count): Best Individual Model on Training Data'),
    (cm_normalized, '.2f', 'Confusion Matrix (Normalized): Best Individual Model on Training Data'),
)
for cm_axis, (cm_values, cm_format, cm_title) in zip(ax, cm_panels):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(cm_title)
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
##################################
individual_validation_classification_report = classification_report(
    y_validation,
    individual_unbalanced_class_best_model_original.predict(X_validation),
)
print(f"F1 Score on Validation Data: {individual_unbalanced_class_best_model_original_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", individual_validation_classification_report)

F1 Score on Validation Data: 0.9495

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.60      0.86      0.71         7
           1       0.98      0.92      0.95        51

    accuracy                           0.91        58
   macro avg       0.79      0.89      0.83        58
weighted avg       0.93      0.91      0.92        58

##################################
# Formulating the raw and normalized
# (per actual class) confusion matrices
# from the validation data
##################################
individual_validation_predictions = individual_unbalanced_class_best_model_original.predict(X_validation)
cm_raw = confusion_matrix(y_validation, individual_validation_predictions)
cm_normalized = confusion_matrix(y_validation, individual_validation_predictions, normalize='true')

##################################
# Plotting both matrices side by side
# as annotated heatmaps
##################################
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
cm_panels = (
    (cm_raw, 'd', 'Confusion Matrix (Raw Count): Best Individual Model on Validation Data'),
    (cm_normalized, '.2f', 'Confusion Matrix (Normalized): Best Individual Model on Validation Data'),
)
for cm_axis, (cm_values, cm_format, cm_title) in zip(ax, cm_panels):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(cm_title)
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Obtaining the logit values (log-odds)
# from the decision function
# and the estimated probabilities
# for the positive class (LUNG_CANCER=YES)
# on the training data
##################################
individual_unbalanced_class_best_model_original_logit_values = (
    individual_unbalanced_class_best_model_original.decision_function(X_train)
)
individual_unbalanced_class_best_model_original_probabilities = (
    individual_unbalanced_class_best_model_original.predict_proba(X_train)[:, 1]
)

##################################
# Ordering both arrays by logit value
# so that the curve is drawn
# from left to right
##################################
individual_unbalanced_class_best_model_original_sorted_indices = np.argsort(
    individual_unbalanced_class_best_model_original_logit_values
)
individual_unbalanced_class_best_model_original_logit_values_sorted = (
    individual_unbalanced_class_best_model_original_logit_values[
        individual_unbalanced_class_best_model_original_sorted_indices
    ]
)
individual_unbalanced_class_best_model_original_probabilities_sorted = (
    individual_unbalanced_class_best_model_original_probabilities[
        individual_unbalanced_class_best_model_original_sorted_indices
    ]
)

##################################
# Plotting the estimated logistic curve
# using the logit values
# and estimated probabilities
# obtained from the training data,
# overlaid with the observations of each class
# and the 0.5 classification threshold
##################################
plt.figure(figsize=(17, 8))
plt.plot(
    individual_unbalanced_class_best_model_original_logit_values_sorted,
    individual_unbalanced_class_best_model_original_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
plt.ylim(-0.05, 1.05)
plt.xlim(-8.00, 8.00)
target_0_indices = y_train == 0
target_1_indices = y_train == 1
for target_indices, target_color, target_label in (
    (target_0_indices, 'blue', 'LUNG_CANCER=NO'),
    (target_1_indices, 'red', 'LUNG_CANCER=YES'),
):
    plt.scatter(
        individual_unbalanced_class_best_model_original_logit_values[target_indices],
        individual_unbalanced_class_best_model_original_probabilities[target_indices],
        color=target_color, alpha=0.40, s=100, marker='o', edgecolor='k', label=target_label,
    )
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
plt.title('Logistic Curve (Original Training Data): Individual Model')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

##################################
# Saving the best individual model
# developed from the original training data
# (the target directory is created first when missing,
# because joblib.dump does not create parent directories)
##################################
individual_unbalanced_class_best_model_original_path = os.path.join(
    "..", MODELS_PATH, "individual_unbalanced_class_best_model_original.pkl"
)
os.makedirs(os.path.dirname(individual_unbalanced_class_best_model_original_path), exist_ok=True)
# Kept as the last expression so the list of written files is still displayed
joblib.dump(individual_unbalanced_class_best_model_original,
            individual_unbalanced_class_best_model_original_path)

['..\\models\\individual_unbalanced_class_best_model_original.pkl']

##################################
# Running the hyperparameter grid search
# for the stacked model
# on the original training data
##################################
stacked_unbalanced_class_grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(class_weight='b...
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': ['balanced'],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(class_weight='b...
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': ['balanced'],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

Pipeline(steps=[('stacked_model',
                 StackingClassifier(estimators=[('dt',
                                                 DecisionTreeClassifier(class_weight='balanced',
                                                                        criterion='entropy',
                                                                        max_depth=3,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('rf',
                                                 RandomForestClassifier(class_weight='balanced',
                                                                        criterion='entropy',
                                                                        max_depth=5,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('svm',
                                                 SVC(C=0.5,
                                                     class_weight='balanced',
                                                     kernel='linear',
                                                     probability=True,
                                                     random_state=88888888))],
                                    final_estimator=LogisticRegression(class_weight='balanced',
                                                                       max_iter=5000,
                                                                       penalty='l1',
                                                                       random_state=88888888,
                                                                       solver='saga')))])

StackingClassifier(estimators=[('dt',
                                DecisionTreeClassifier(class_weight='balanced',
                                                       criterion='entropy',
                                                       max_depth=3,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('rf',
                                RandomForestClassifier(class_weight='balanced',
                                                       criterion='entropy',
                                                       max_depth=5,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('svm',
                                SVC(C=0.5, class_weight='balanced',
                                    kernel='linear', probability=True,
                                    random_state=88888888))],
                   final_estimator=LogisticRegression(class_weight='balanced',
                                                      max_iter=5000,
                                                      penalty='l1',
                                                      random_state=88888888,
                                                      solver='saga'))

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=3, min_samples_leaf=3, random_state=88888888)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=5, min_samples_leaf=3, random_state=88888888)

SVC(C=0.5, class_weight='balanced', kernel='linear', probability=True,
    random_state=88888888)

LogisticRegression(class_weight='balanced', max_iter=5000, penalty='l1',
                   random_state=88888888, solver='saga')

##################################
# Retrieving the best estimator
# found by the grid search
##################################
stacked_unbalanced_class_best_model_original = (
    stacked_unbalanced_class_grid_search.best_estimator_
)

##################################
# Collecting the F1 scores:
# the grid search's best cross-validation score,
# plus the scores on the training and validation data
##################################
stacked_unbalanced_class_best_model_original_f1_cv = (
    stacked_unbalanced_class_grid_search.best_score_
)
stacked_unbalanced_class_best_model_original_f1_train = f1_score(
    y_true=y_train,
    y_pred=stacked_unbalanced_class_best_model_original.predict(X_train),
)
stacked_unbalanced_class_best_model_original_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=stacked_unbalanced_class_best_model_original.predict(X_validation),
)

##################################
# Reporting the selected hyperparameters
##################################
print(
    'Best Stacked Model using the Original Train Data: ',
    "Best Stacked Model Parameters: {}".format(stacked_unbalanced_class_grid_search.best_params_),
    sep='\n',
)

Best Stacked Model using the Original Train Data: 
Best Stacked Model Parameters: {'stacked_model__dt__max_depth': 3, 'stacked_model__final_estimator__class_weight': 'balanced', 'stacked_model__final_estimator__penalty': 'l1', 'stacked_model__rf__max_depth': 5, 'stacked_model__svm__C': 0.5}

##################################
# Printing the cross-validation and training
# F1 scores together to gauge overfitting optimism,
# followed by the per-class metrics
# on the training data
##################################
print("F1 Score on Cross-Validated Data: {:.4f}".format(stacked_unbalanced_class_best_model_original_f1_cv))
print("F1 Score on Training Data: {:.4f}".format(stacked_unbalanced_class_best_model_original_f1_train))
print(
    "\nClassification Report on Training Data:\n",
    classification_report(
        y_true=y_train,
        y_pred=stacked_unbalanced_class_best_model_original.predict(X_train),
    ),
)

F1 Score on Cross-Validated Data: 0.9125
F1 Score on Training Data: 0.9404

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.56      1.00      0.72        22
           1       1.00      0.89      0.94       151

    accuracy                           0.90       173
   macro avg       0.78      0.94      0.83       173
weighted avg       0.94      0.90      0.91       173

##################################
# Formulating the raw and normalized
# confusion matrices
# from the training data
# (the stacked model predicts once and both
# matrices reuse that single set of predictions)
##################################
stacked_unbalanced_class_best_model_original_predictions_train = (
    stacked_unbalanced_class_best_model_original.predict(X_train)
)
cm_raw = confusion_matrix(y_train, stacked_unbalanced_class_best_model_original_predictions_train)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_train, stacked_unbalanced_class_best_model_original_predictions_train, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: absolute counts
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix (Raw Count): Best Stacked Model on Training Data')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
# Right panel: per-actual-class proportions
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Confusion Matrix (Normalized): Best Stacked Model on Training Data')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Printing the F1 score
# and the per-class metrics
# on the validation data
##################################
print("F1 Score on Validation Data: {:.4f}".format(stacked_unbalanced_class_best_model_original_f1_validation))
print(
    "\nClassification Report on Validation Data:\n",
    classification_report(
        y_true=y_validation,
        y_pred=stacked_unbalanced_class_best_model_original.predict(X_validation),
    ),
)

F1 Score on Validation Data: 0.9149

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.47      1.00      0.64         7
           1       1.00      0.84      0.91        51

    accuracy                           0.86        58
   macro avg       0.73      0.92      0.78        58
weighted avg       0.94      0.86      0.88        58

##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
# (the stacked model predicts once and both
# matrices reuse that single set of predictions)
##################################
stacked_unbalanced_class_best_model_original_predictions_validation = (
    stacked_unbalanced_class_best_model_original.predict(X_validation)
)
cm_raw = confusion_matrix(y_validation, stacked_unbalanced_class_best_model_original_predictions_validation)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_validation, stacked_unbalanced_class_best_model_original_predictions_validation, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: absolute counts
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix (Raw Count): Best Stacked Model on Validation Data')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
# Right panel: per-actual-class proportions
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Confusion Matrix (Normalized): Best Stacked Model on Validation Data')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Decision-function output of the stacked model
# for every training row, used as the
# logit (log-odds) axis of the curve
##################################
stacked_unbalanced_class_best_model_original_logit_values = (
    stacked_unbalanced_class_best_model_original.decision_function(X_train)
)

##################################
# Estimated probability of the positive class
# (LUNG_CANCER=YES) for every training row,
# taken from the second predict_proba column
##################################
stacked_unbalanced_class_best_model_original_probabilities = (
    stacked_unbalanced_class_best_model_original.predict_proba(X_train)[:, 1]
)

##################################
# Ordering both arrays by ascending logit
# so the curve is drawn from left to right
##################################
stacked_unbalanced_class_best_model_original_sorted_indices = np.argsort(
    stacked_unbalanced_class_best_model_original_logit_values
)
stacked_unbalanced_class_best_model_original_logit_values_sorted = (
    stacked_unbalanced_class_best_model_original_logit_values[
        stacked_unbalanced_class_best_model_original_sorted_indices
    ]
)
stacked_unbalanced_class_best_model_original_probabilities_sorted = (
    stacked_unbalanced_class_best_model_original_probabilities[
        stacked_unbalanced_class_best_model_original_sorted_indices
    ]
)

##################################
# Drawing the estimated logistic curve
# (probability against logit) for the training data,
# overlaying the observations of each class
# and the 0.5 probability threshold
##################################
plt.figure(figsize=(17, 8))
logistic_curve_axes = plt.gca()
logistic_curve_axes.plot(
    stacked_unbalanced_class_best_model_original_logit_values_sorted,
    stacked_unbalanced_class_best_model_original_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
logistic_curve_axes.set_ylim(-0.05, 1.05)
logistic_curve_axes.set_xlim(-8.00, 8.00)
target_0_indices = y_train == 0
target_1_indices = y_train == 1
# One scatter layer per observed class, negatives first so the legend order is unchanged
for class_indices, class_color, class_label in (
    (target_0_indices, 'blue', 'LUNG_CANCER=NO'),
    (target_1_indices, 'red', 'LUNG_CANCER=YES'),
):
    logistic_curve_axes.scatter(
        stacked_unbalanced_class_best_model_original_logit_values[class_indices],
        stacked_unbalanced_class_best_model_original_probabilities[class_indices],
        color=class_color,
        alpha=0.40,
        s=100,
        marker='o',
        edgecolor='k',
        label=class_label,
    )
logistic_curve_axes.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
logistic_curve_axes.set_title('Logistic Curve (Original Training Data): Stacked Model')
logistic_curve_axes.set_xlabel('Logit (Log-Odds)')
logistic_curve_axes.set_ylabel('Estimated Lung Cancer Probability')
logistic_curve_axes.grid(True)
logistic_curve_axes.legend(loc='upper left')
plt.show()

##################################
# Saving the best stacked model
# developed from the original training data
# (the target directory is created first when missing,
# because joblib.dump does not create parent directories)
##################################
stacked_unbalanced_class_best_model_original_path = os.path.join(
    "..", MODELS_PATH, "stacked_unbalanced_class_best_model_original.pkl"
)
os.makedirs(os.path.dirname(stacked_unbalanced_class_best_model_original_path), exist_ok=True)
# Kept as the last expression so the list of written files is still displayed
joblib.dump(stacked_unbalanced_class_best_model_original,
            stacked_unbalanced_class_best_model_original_path)

['..\\models\\stacked_unbalanced_class_best_model_original.pkl']

##################################
# Running the hyperparameter grid search
# for the individual model
# on the SMOTE-upsampled training data
##################################
individual_balanced_class_grid_search.fit(X=X_train_smote, y=y_train_smote)

Fitting 5 folds for each of 3 candidates, totalling 15 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': [None],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': [None],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

Pipeline(steps=[('individual_model',
                 LogisticRegression(max_iter=5000, penalty='l1',
                                    random_state=88888888, solver='saga'))])

LogisticRegression(max_iter=5000, penalty='l1', random_state=88888888,
                   solver='saga')

##################################
# Retrieving the best estimator
# found by the grid search
##################################
individual_balanced_class_best_model_upsampled = (
    individual_balanced_class_grid_search.best_estimator_
)

##################################
# Collecting the F1 scores:
# the grid search's best cross-validation score,
# plus the scores on the upsampled training data
# and on the validation data
##################################
individual_balanced_class_best_model_upsampled_f1_cv = (
    individual_balanced_class_grid_search.best_score_
)
individual_balanced_class_best_model_upsampled_f1_train_smote = f1_score(
    y_true=y_train_smote,
    y_pred=individual_balanced_class_best_model_upsampled.predict(X_train_smote),
)
individual_balanced_class_best_model_upsampled_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=individual_balanced_class_best_model_upsampled.predict(X_validation),
)

##################################
# Reporting the selected hyperparameters
##################################
print(
    'Best Individual Model using the SMOTE-Upsampled Train Data: ',
    "Best Individual Model Parameters: {}".format(individual_balanced_class_grid_search.best_params_),
    sep='\n',
)

Best Individual Model using the SMOTE-Upsampled Train Data: 
Best Individual Model Parameters: {'individual_model__class_weight': None, 'individual_model__penalty': 'l1'}

##################################
# Printing the cross-validation and training
# F1 scores together to gauge overfitting optimism,
# followed by the per-class metrics
# on the upsampled training data
##################################
print("F1 Score on Cross-Validated Data: {:.4f}".format(individual_balanced_class_best_model_upsampled_f1_cv))
print("F1 Score on Training Data: {:.4f}".format(individual_balanced_class_best_model_upsampled_f1_train_smote))
print(
    "\nClassification Report on Training Data:\n",
    classification_report(
        y_true=y_train_smote,
        y_pred=individual_balanced_class_best_model_upsampled.predict(X_train_smote),
    ),
)

F1 Score on Cross-Validated Data: 0.9109
F1 Score on Training Data: 0.9122

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92       151
           1       0.93      0.89      0.91       151

    accuracy                           0.91       302
   macro avg       0.91      0.91      0.91       302
weighted avg       0.91      0.91      0.91       302

##################################
# Formulating the raw and normalized
# confusion matrices
# from the upsampled training data
# (the model predicts once and both
# matrices reuse that single set of predictions)
##################################
individual_balanced_class_best_model_upsampled_predictions_train_smote = (
    individual_balanced_class_best_model_upsampled.predict(X_train_smote)
)
cm_raw = confusion_matrix(y_train_smote, individual_balanced_class_best_model_upsampled_predictions_train_smote)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_train_smote, individual_balanced_class_best_model_upsampled_predictions_train_smote, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: absolute counts
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix (Raw Count): Best Individual Model on Training Data')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
# Right panel: per-actual-class proportions
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Confusion Matrix (Normalized): Best Individual Model on Training Data')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Printing the F1 score
# and the per-class metrics
# on the validation data
##################################
print("F1 Score on Validation Data: {:.4f}".format(individual_balanced_class_best_model_upsampled_f1_validation))
print(
    "\nClassification Report on Validation Data:\n",
    classification_report(
        y_true=y_validation,
        y_pred=individual_balanced_class_best_model_upsampled.predict(X_validation),
    ),
)

F1 Score on Validation Data: 0.9278

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.50      0.86      0.63         7
           1       0.98      0.88      0.93        51

    accuracy                           0.88        58
   macro avg       0.74      0.87      0.78        58
weighted avg       0.92      0.88      0.89        58

##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
# (the model predicts once and both
# matrices reuse that single set of predictions)
##################################
individual_balanced_class_best_model_upsampled_predictions_validation = (
    individual_balanced_class_best_model_upsampled.predict(X_validation)
)
cm_raw = confusion_matrix(y_validation, individual_balanced_class_best_model_upsampled_predictions_validation)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_validation, individual_balanced_class_best_model_upsampled_predictions_validation, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: absolute counts
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Confusion Matrix (Raw Count): Best Individual Model on Validation Data')
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
# Right panel: per-actual-class proportions
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Confusion Matrix (Normalized): Best Individual Model on Validation Data')
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Decision-function output of the individual model
# for every upsampled training row, used as the
# logit (log-odds) axis of the curve
##################################
individual_balanced_class_best_model_upsampled_logit_values = (
    individual_balanced_class_best_model_upsampled.decision_function(X_train_smote)
)

##################################
# Estimated probability of the positive class
# (LUNG_CANCER=YES) for every upsampled training row,
# taken from the second predict_proba column
##################################
individual_balanced_class_best_model_upsampled_probabilities = (
    individual_balanced_class_best_model_upsampled.predict_proba(X_train_smote)[:, 1]
)

##################################
# Ordering both arrays by ascending logit
# so the curve is drawn from left to right
##################################
individual_balanced_class_best_model_upsampled_sorted_indices = np.argsort(
    individual_balanced_class_best_model_upsampled_logit_values
)
individual_balanced_class_best_model_upsampled_logit_values_sorted = (
    individual_balanced_class_best_model_upsampled_logit_values[
        individual_balanced_class_best_model_upsampled_sorted_indices
    ]
)
individual_balanced_class_best_model_upsampled_probabilities_sorted = (
    individual_balanced_class_best_model_upsampled_probabilities[
        individual_balanced_class_best_model_upsampled_sorted_indices
    ]
)

##################################
# Drawing the estimated logistic curve
# (probability against logit) for the upsampled training data,
# overlaying the observations of each class
# and the 0.5 probability threshold
##################################
plt.figure(figsize=(17, 8))
logistic_curve_axes = plt.gca()
logistic_curve_axes.plot(
    individual_balanced_class_best_model_upsampled_logit_values_sorted,
    individual_balanced_class_best_model_upsampled_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
logistic_curve_axes.set_ylim(-0.05, 1.05)
logistic_curve_axes.set_xlim(-8.00, 8.00)
target_0_indices = y_train_smote == 0
target_1_indices = y_train_smote == 1
# One scatter layer per observed class, negatives first so the legend order is unchanged
for class_indices, class_color, class_label in (
    (target_0_indices, 'blue', 'LUNG_CANCER=NO'),
    (target_1_indices, 'red', 'LUNG_CANCER=YES'),
):
    logistic_curve_axes.scatter(
        individual_balanced_class_best_model_upsampled_logit_values[class_indices],
        individual_balanced_class_best_model_upsampled_probabilities[class_indices],
        color=class_color,
        alpha=0.40,
        s=100,
        marker='o',
        edgecolor='k',
        label=class_label,
    )
logistic_curve_axes.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
logistic_curve_axes.set_title('Logistic Curve (Upsampled Training Data): Individual Model')
logistic_curve_axes.set_xlabel('Logit (Log-Odds)')
logistic_curve_axes.set_ylabel('Estimated Lung Cancer Probability')
logistic_curve_axes.grid(True)
logistic_curve_axes.legend(loc='upper left')
plt.show()

##################################
# Saving the best individual model
# developed from the upsampled training data
# (the target directory is created first when missing,
# because joblib.dump does not create parent directories)
##################################
individual_balanced_class_best_model_upsampled_path = os.path.join(
    "..", MODELS_PATH, "individual_balanced_class_best_model_upsampled.pkl"
)
os.makedirs(os.path.dirname(individual_balanced_class_best_model_upsampled_path), exist_ok=True)
# Kept as the last expression so the list of written files is still displayed
joblib.dump(individual_balanced_class_best_model_upsampled,
            individual_balanced_class_best_model_upsampled_path)

['..\\models\\individual_balanced_class_best_model_upsampled.pkl']

##################################
# Running the hyperparameter grid search
# for the stacked model
# on the SMOTE-upsampled training data
##################################
stacked_balanced_class_grid_search.fit(X=X_train_smote, y=y_train_smote)

Fitting 5 folds for each of 24 candidates, totalling 120 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(kernel='linear',
                                                                            probability=True,
                                                                            random_state=88888888))],
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': [None],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(kernel='linear',
                                                                            probability=True,
                                                                            random_state=88888888))],
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': [None],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

Pipeline(steps=[('stacked_model',
                 StackingClassifier(estimators=[('dt',
                                                 DecisionTreeClassifier(criterion='entropy',
                                                                        max_depth=3,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('rf',
                                                 RandomForestClassifier(criterion='entropy',
                                                                        max_depth=5,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('svm',
                                                 SVC(kernel='linear',
                                                     probability=True,
                                                     random_state=88888888))],
                                    final_estimator=LogisticRegression(max_iter=5000,
                                                                       penalty=None,
                                                                       random_state=88888888,
                                                                       solver='saga')))])

StackingClassifier(estimators=[('dt',
                                DecisionTreeClassifier(criterion='entropy',
                                                       max_depth=3,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('rf',
                                RandomForestClassifier(criterion='entropy',
                                                       max_depth=5,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('svm',
                                SVC(kernel='linear', probability=True,
                                    random_state=88888888))],
                   final_estimator=LogisticRegression(max_iter=5000,
                                                      penalty=None,
                                                      random_state=88888888,
                                                      solver='saga'))

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=3,
                       random_state=88888888)

RandomForestClassifier(criterion='entropy', max_depth=5, min_samples_leaf=3,
                       random_state=88888888)

SVC(kernel='linear', probability=True, random_state=88888888)

LogisticRegression(max_iter=5000, penalty=None, random_state=88888888,
                   solver='saga')

##################################
# Retrieving the best estimator
# selected by the grid search
##################################
stacked_balanced_class_best_model_upsampled = (
    stacked_balanced_class_grid_search.best_estimator_
)

##################################
# Computing the F1 scores for the
# cross-validation folds, the upsampled
# training data, and the validation data
##################################
stacked_balanced_class_best_model_upsampled_f1_cv = (
    stacked_balanced_class_grid_search.best_score_
)
stacked_balanced_class_best_model_upsampled_f1_train_smote = f1_score(
    y_true=y_train_smote,
    y_pred=stacked_balanced_class_best_model_upsampled.predict(X_train_smote),
)
stacked_balanced_class_best_model_upsampled_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=stacked_balanced_class_best_model_upsampled.predict(X_validation),
)

##################################
# Reporting the hyperparameters
# of the selected model
##################################
print(
    'Best Stacked Model using the SMOTE-Upsampled Train Data: ',
    f"Best Stacked Model Parameters: {stacked_balanced_class_grid_search.best_params_}",
    sep="\n",
)

Best Stacked Model using the SMOTE-Upsampled Train Data: 
Best Stacked Model Parameters: {'stacked_model__dt__max_depth': 3, 'stacked_model__final_estimator__class_weight': None, 'stacked_model__final_estimator__penalty': None, 'stacked_model__rf__max_depth': 5, 'stacked_model__svm__C': 1.0}

##################################
# Printing the cross-validated and
# training F1 scores side by side
# so that overfitting optimism
# can be judged
##################################
print(
    f"F1 Score on Cross-Validated Data: {stacked_balanced_class_best_model_upsampled_f1_cv:.4f}",
    f"F1 Score on Training Data: {stacked_balanced_class_best_model_upsampled_f1_train_smote:.4f}",
    sep="\n",
)
print(
    "\nClassification Report on Training Data:\n",
    classification_report(
        y_true=y_train_smote,
        y_pred=stacked_balanced_class_best_model_upsampled.predict(X_train_smote),
    ),
)

F1 Score on Cross-Validated Data: 0.9489
F1 Score on Training Data: 0.9568

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96       151
           1       0.96      0.95      0.96       151

    accuracy                           0.96       302
   macro avg       0.96      0.96      0.96       302
weighted avg       0.96      0.96      0.96       302

##################################
# Formulating the raw and normalized
# confusion matrices
# from the training data
##################################
# Predict once and reuse the labels for both matrices; calling predict()
# twice on the stacked ensemble repeats the whole inference pass for nothing.
stacked_balanced_class_best_model_upsampled_train_predictions = stacked_balanced_class_best_model_upsampled.predict(X_train_smote)
cm_raw = confusion_matrix(y_train_smote, stacked_balanced_class_best_model_upsampled_train_predictions)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_train_smote, stacked_balanced_class_best_model_upsampled_train_predictions, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: integer counts; right panel: row-normalized rates
for cm_axis, cm_values, cm_format, cm_label in zip(ax, (cm_raw, cm_normalized), ('d', '.2f'), ('Raw Count', 'Normalized')):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(f'Confusion Matrix ({cm_label}): Best Stacked Model on Training Data')
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Printing the F1 score and the
# per-class classification metrics
# obtained on the validation data
##################################
print(
    f"F1 Score on Validation Data: {stacked_balanced_class_best_model_upsampled_f1_validation:.4f}"
)
print(
    "\nClassification Report on Validation Data:\n",
    classification_report(
        y_true=y_validation,
        y_pred=stacked_balanced_class_best_model_upsampled.predict(X_validation),
    ),
)

F1 Score on Validation Data: 0.9615

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.94      0.98      0.96        51

    accuracy                           0.93        58
   macro avg       0.87      0.78      0.81        58
weighted avg       0.93      0.93      0.93        58

##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
# Predict once and reuse the labels for both matrices; calling predict()
# twice on the stacked ensemble repeats the whole inference pass for nothing.
stacked_balanced_class_best_model_upsampled_validation_predictions = stacked_balanced_class_best_model_upsampled.predict(X_validation)
cm_raw = confusion_matrix(y_validation, stacked_balanced_class_best_model_upsampled_validation_predictions)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_validation, stacked_balanced_class_best_model_upsampled_validation_predictions, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: integer counts; right panel: row-normalized rates
for cm_axis, cm_values, cm_format, cm_label in zip(ax, (cm_raw, cm_normalized), ('d', '.2f'), ('Raw Count', 'Normalized')):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(f'Confusion Matrix ({cm_label}): Best Stacked Model on Validation Data')
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Scoring the upsampled training data
# with the decision function to get
# the logit values (log-odds)
##################################
stacked_balanced_class_best_model_upsampled_logit_values = (
    stacked_balanced_class_best_model_upsampled.decision_function(X=X_train_smote)
)

##################################
# Keeping the second predict_proba column,
# i.e. the estimated probability of the
# positive class (LUNG_CANCER=YES)
##################################
stacked_balanced_class_best_model_upsampled_probabilities = (
    stacked_balanced_class_best_model_upsampled.predict_proba(X=X_train_smote)[:, 1]
)

##################################
# Ordering both arrays by ascending logit
# so the curve is drawn left to right
##################################
stacked_balanced_class_best_model_upsampled_sorted_indices = (
    stacked_balanced_class_best_model_upsampled_logit_values.argsort()
)
stacked_balanced_class_best_model_upsampled_logit_values_sorted = np.take(
    stacked_balanced_class_best_model_upsampled_logit_values,
    stacked_balanced_class_best_model_upsampled_sorted_indices,
)
stacked_balanced_class_best_model_upsampled_probabilities_sorted = np.take(
    stacked_balanced_class_best_model_upsampled_probabilities,
    stacked_balanced_class_best_model_upsampled_sorted_indices,
)

##################################
# Drawing the estimated logistic curve
# (probability against logit) together
# with the individual training cases
##################################
plt.figure(figsize=(17, 8))
plt.plot(
    stacked_balanced_class_best_model_upsampled_logit_values_sorted,
    stacked_balanced_class_best_model_upsampled_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
plt.ylim(-0.05, 1.05)
plt.xlim(-8.00, 8.00)
# Boolean masks separating the two target classes
target_0_indices = y_train_smote == 0
target_1_indices = y_train_smote == 1
# Both classes share marker styling; only the colour and legend label differ
for class_mask, class_color, class_label in (
    (target_0_indices, 'blue', 'LUNG_CANCER=NO'),
    (target_1_indices, 'red', 'LUNG_CANCER=YES'),
):
    plt.scatter(
        stacked_balanced_class_best_model_upsampled_logit_values[class_mask],
        stacked_balanced_class_best_model_upsampled_probabilities[class_mask],
        color=class_color,
        alpha=0.40,
        s=100,
        marker='o',
        edgecolor='k',
        label=class_label,
    )
# Horizontal reference line at the 0.5 probability cut-off
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
plt.title('Logistic Curve (Upsampled Training Data): Stacked Model')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

##################################
# Persisting the best stacked model
# fitted on the upsampled training data
# under the models directory
##################################
joblib.dump(
    value=stacked_balanced_class_best_model_upsampled,
    filename=os.path.join("..", MODELS_PATH, "stacked_balanced_class_best_model_upsampled.pkl"),
)

['..\\models\\stacked_balanced_class_best_model_upsampled.pkl']

##################################
# Running the grid search for the
# individual model on the
# CNN-downsampled training data
##################################
individual_unbalanced_class_grid_search.fit(X=X_train_cnn, y=y_train_cnn)

Fitting 5 folds for each of 3 candidates, totalling 15 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': ['balanced'],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('individual_model',
                                        LogisticRegression(max_iter=5000,
                                                           random_state=88888888,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'individual_model__class_weight': ['balanced'],
                         'individual_model__penalty': ['l1', 'l2', None]},
             scoring='f1', verbose=1)

Pipeline(steps=[('individual_model',
                 LogisticRegression(class_weight='balanced', max_iter=5000,
                                    random_state=88888888, solver='saga'))])

LogisticRegression(class_weight='balanced', max_iter=5000,
                   random_state=88888888, solver='saga')

##################################
# Retrieving the best estimator
# selected by the grid search
##################################
individual_unbalanced_class_best_model_downsampled = (
    individual_unbalanced_class_grid_search.best_estimator_
)

##################################
# Computing the F1 scores for the
# cross-validation folds, the downsampled
# training data, and the validation data
##################################
individual_unbalanced_class_best_model_downsampled_f1_cv = (
    individual_unbalanced_class_grid_search.best_score_
)
individual_unbalanced_class_best_model_downsampled_f1_train_cnn = f1_score(
    y_true=y_train_cnn,
    y_pred=individual_unbalanced_class_best_model_downsampled.predict(X_train_cnn),
)
individual_unbalanced_class_best_model_downsampled_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=individual_unbalanced_class_best_model_downsampled.predict(X_validation),
)

##################################
# Reporting the hyperparameters
# of the selected model
##################################
print(
    'Best Individual Model using the CNN-Downsampled Train Data: ',
    f"Best Individual Model Parameters: {individual_unbalanced_class_grid_search.best_params_}",
    sep="\n",
)

Best Individual Model using the CNN-Downsampled Train Data: 
Best Individual Model Parameters: {'individual_model__class_weight': 'balanced', 'individual_model__penalty': 'l2'}

##################################
# Printing the cross-validated and
# training F1 scores side by side
# so that overfitting optimism
# can be judged
##################################
print(
    f"F1 Score on Cross-Validated Data: {individual_unbalanced_class_best_model_downsampled_f1_cv:.4f}",
    f"F1 Score on Training Data: {individual_unbalanced_class_best_model_downsampled_f1_train_cnn:.4f}",
    sep="\n",
)
print(
    "\nClassification Report on Training Data:\n",
    classification_report(
        y_true=y_train_cnn,
        y_pred=individual_unbalanced_class_best_model_downsampled.predict(X_train_cnn),
    ),
)

F1 Score on Cross-Validated Data: 0.7537
F1 Score on Training Data: 0.8533

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.72      0.82      0.77        22
           1       0.89      0.82      0.85        39

    accuracy                           0.82        61
   macro avg       0.80      0.82      0.81        61
weighted avg       0.83      0.82      0.82        61

##################################
# Formulating the raw and normalized
# confusion matrices
# from the training data
##################################
# Predict once and reuse the labels for both matrices instead of running
# a second, redundant inference pass.
individual_unbalanced_class_best_model_downsampled_train_predictions = individual_unbalanced_class_best_model_downsampled.predict(X_train_cnn)
cm_raw = confusion_matrix(y_train_cnn, individual_unbalanced_class_best_model_downsampled_train_predictions)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_train_cnn, individual_unbalanced_class_best_model_downsampled_train_predictions, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: integer counts; right panel: row-normalized rates
for cm_axis, cm_values, cm_format, cm_label in zip(ax, (cm_raw, cm_normalized), ('d', '.2f'), ('Raw Count', 'Normalized')):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(f'Confusion Matrix ({cm_label}): Best Individual Model on Training Data')
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Printing the F1 score and the
# per-class classification metrics
# obtained on the validation data
##################################
print(
    f"F1 Score on Validation Data: {individual_unbalanced_class_best_model_downsampled_f1_validation:.4f}"
)
print(
    "\nClassification Report on Validation Data:\n",
    classification_report(
        y_true=y_validation,
        y_pred=individual_unbalanced_class_best_model_downsampled.predict(X_validation),
    ),
)

F1 Score on Validation Data: 0.9709

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.83      0.71      0.77         7
           1       0.96      0.98      0.97        51

    accuracy                           0.95        58
   macro avg       0.90      0.85      0.87        58
weighted avg       0.95      0.95      0.95        58

##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
# Predict once and reuse the labels for both matrices instead of running
# a second, redundant inference pass.
individual_unbalanced_class_best_model_downsampled_validation_predictions = individual_unbalanced_class_best_model_downsampled.predict(X_validation)
cm_raw = confusion_matrix(y_validation, individual_unbalanced_class_best_model_downsampled_validation_predictions)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_validation, individual_unbalanced_class_best_model_downsampled_validation_predictions, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: integer counts; right panel: row-normalized rates
for cm_axis, cm_values, cm_format, cm_label in zip(ax, (cm_raw, cm_normalized), ('d', '.2f'), ('Raw Count', 'Normalized')):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(f'Confusion Matrix ({cm_label}): Best Individual Model on Validation Data')
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Scoring the downsampled training data
# with the decision function to get
# the logit values (log-odds)
##################################
individual_unbalanced_class_best_model_downsampled_logit_values = (
    individual_unbalanced_class_best_model_downsampled.decision_function(X=X_train_cnn)
)

##################################
# Keeping the second predict_proba column,
# i.e. the estimated probability of the
# positive class (LUNG_CANCER=YES)
##################################
individual_unbalanced_class_best_model_downsampled_probabilities = (
    individual_unbalanced_class_best_model_downsampled.predict_proba(X=X_train_cnn)[:, 1]
)

##################################
# Ordering both arrays by ascending logit
# so the curve is drawn left to right
##################################
individual_unbalanced_class_best_model_downsampled_sorted_indices = (
    individual_unbalanced_class_best_model_downsampled_logit_values.argsort()
)
individual_unbalanced_class_best_model_downsampled_logit_values_sorted = np.take(
    individual_unbalanced_class_best_model_downsampled_logit_values,
    individual_unbalanced_class_best_model_downsampled_sorted_indices,
)
individual_unbalanced_class_best_model_downsampled_probabilities_sorted = np.take(
    individual_unbalanced_class_best_model_downsampled_probabilities,
    individual_unbalanced_class_best_model_downsampled_sorted_indices,
)

##################################
# Drawing the estimated logistic curve
# (probability against logit) together
# with the individual training cases
##################################
plt.figure(figsize=(17, 8))
plt.plot(
    individual_unbalanced_class_best_model_downsampled_logit_values_sorted,
    individual_unbalanced_class_best_model_downsampled_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
plt.ylim(-0.05, 1.05)
plt.xlim(-8.00, 8.00)
# Boolean masks separating the two target classes
target_0_indices = y_train_cnn == 0
target_1_indices = y_train_cnn == 1
# Both classes share marker styling; only the colour and legend label differ
for class_mask, class_color, class_label in (
    (target_0_indices, 'blue', 'LUNG_CANCER=NO'),
    (target_1_indices, 'red', 'LUNG_CANCER=YES'),
):
    plt.scatter(
        individual_unbalanced_class_best_model_downsampled_logit_values[class_mask],
        individual_unbalanced_class_best_model_downsampled_probabilities[class_mask],
        color=class_color,
        alpha=0.40,
        s=100,
        marker='o',
        edgecolor='k',
        label=class_label,
    )
# Horizontal reference line at the 0.5 probability cut-off
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
plt.title('Logistic Curve (Downsampled Training Data): Individual Model')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

##################################
# Persisting the best individual model
# fitted on the downsampled training data
# under the models directory
##################################
joblib.dump(
    value=individual_unbalanced_class_best_model_downsampled,
    filename=os.path.join("..", MODELS_PATH, "individual_unbalanced_class_best_model_downsampled.pkl"),
)

['..\\models\\individual_unbalanced_class_best_model_downsampled.pkl']

##################################
# Running the grid search for the
# stacked model on the
# CNN-downsampled training data
##################################
stacked_unbalanced_class_grid_search.fit(X=X_train_cnn, y=y_train_cnn)

Fitting 5 folds for each of 24 candidates, totalling 120 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(class_weight='b...
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': ['balanced'],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('stacked_model',
                                        StackingClassifier(estimators=[('dt',
                                                                        DecisionTreeClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('rf',
                                                                        RandomForestClassifier(class_weight='balanced',
                                                                                               criterion='entropy',
                                                                                               min_samples_leaf=3,
                                                                                               random_state=88888888)),
                                                                       ('svm',
                                                                        SVC(class_weight='b...
                                                           final_estimator=LogisticRegression(max_iter=5000,
                                                                                              random_state=88888888,
                                                                                              solver='saga')))]),
             n_jobs=-1,
             param_grid={'stacked_model__dt__max_depth': [3, 5],
                         'stacked_model__final_estimator__class_weight': ['balanced'],
                         'stacked_model__final_estimator__penalty': ['l1', 'l2',
                                                                     None],
                         'stacked_model__rf__max_depth': [3, 5],
                         'stacked_model__svm__C': [0.5, 1.0]},
             scoring='f1', verbose=1)

Pipeline(steps=[('stacked_model',
                 StackingClassifier(estimators=[('dt',
                                                 DecisionTreeClassifier(class_weight='balanced',
                                                                        criterion='entropy',
                                                                        max_depth=3,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('rf',
                                                 RandomForestClassifier(class_weight='balanced',
                                                                        criterion='entropy',
                                                                        max_depth=3,
                                                                        min_samples_leaf=3,
                                                                        random_state=88888888)),
                                                ('svm',
                                                 SVC(class_weight='balanced',
                                                     kernel='linear',
                                                     probability=True,
                                                     random_state=88888888))],
                                    final_estimator=LogisticRegression(class_weight='balanced',
                                                                       max_iter=5000,
                                                                       penalty=None,
                                                                       random_state=88888888,
                                                                       solver='saga')))])

StackingClassifier(estimators=[('dt',
                                DecisionTreeClassifier(class_weight='balanced',
                                                       criterion='entropy',
                                                       max_depth=3,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('rf',
                                RandomForestClassifier(class_weight='balanced',
                                                       criterion='entropy',
                                                       max_depth=3,
                                                       min_samples_leaf=3,
                                                       random_state=88888888)),
                               ('svm',
                                SVC(class_weight='balanced', kernel='linear',
                                    probability=True, random_state=88888888))],
                   final_estimator=LogisticRegression(class_weight='balanced',
                                                      max_iter=5000,
                                                      penalty=None,
                                                      random_state=88888888,
                                                      solver='saga'))

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=3, min_samples_leaf=3, random_state=88888888)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=3, min_samples_leaf=3, random_state=88888888)

SVC(class_weight='balanced', kernel='linear', probability=True,
    random_state=88888888)

LogisticRegression(class_weight='balanced', max_iter=5000, penalty=None,
                   random_state=88888888, solver='saga')

##################################
# Retrieving the best estimator
# selected by the grid search
##################################
stacked_unbalanced_class_best_model_downsampled = (
    stacked_unbalanced_class_grid_search.best_estimator_
)

##################################
# Computing the F1 scores for the
# cross-validation folds, the downsampled
# training data, and the validation data
##################################
stacked_unbalanced_class_best_model_downsampled_f1_cv = (
    stacked_unbalanced_class_grid_search.best_score_
)
stacked_unbalanced_class_best_model_downsampled_f1_train_cnn = f1_score(
    y_true=y_train_cnn,
    y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_train_cnn),
)
stacked_unbalanced_class_best_model_downsampled_f1_validation = f1_score(
    y_true=y_validation,
    y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_validation),
)

##################################
# Reporting the hyperparameters
# of the selected model
##################################
print(
    'Best Stacked Model using the CNN-Downsampled Train Data: ',
    f"Best Stacked Model Parameters: {stacked_unbalanced_class_grid_search.best_params_}",
    sep="\n",
)

Best Stacked Model using the CNN-Downsampled Train Data: 
Best Stacked Model Parameters: {'stacked_model__dt__max_depth': 3, 'stacked_model__final_estimator__class_weight': 'balanced', 'stacked_model__final_estimator__penalty': None, 'stacked_model__rf__max_depth': 3, 'stacked_model__svm__C': 1.0}

##################################
# Printing the cross-validated and
# training F1 scores side by side
# so that overfitting optimism
# can be judged
##################################
print(
    f"F1 Score on Cross-Validated Data: {stacked_unbalanced_class_best_model_downsampled_f1_cv:.4f}",
    f"F1 Score on Training Data: {stacked_unbalanced_class_best_model_downsampled_f1_train_cnn:.4f}",
    sep="\n",
)
print(
    "\nClassification Report on Training Data:\n",
    classification_report(
        y_true=y_train_cnn,
        y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_train_cnn),
    ),
)

F1 Score on Cross-Validated Data: 0.7531
F1 Score on Training Data: 0.8219

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.67      0.82      0.73        22
           1       0.88      0.77      0.82        39

    accuracy                           0.79        61
   macro avg       0.77      0.79      0.78        61
weighted avg       0.80      0.79      0.79        61

##################################
# Formulating the raw and normalized
# confusion matrices
# from the training data
##################################
# Predict once and reuse the labels for both matrices; calling predict()
# twice on the stacked ensemble repeats the whole inference pass for nothing.
stacked_unbalanced_class_best_model_downsampled_train_predictions = stacked_unbalanced_class_best_model_downsampled.predict(X_train_cnn)
cm_raw = confusion_matrix(y_train_cnn, stacked_unbalanced_class_best_model_downsampled_train_predictions)
# normalize='true' scales each row (actual class) to sum to 1
cm_normalized = confusion_matrix(y_train_cnn, stacked_unbalanced_class_best_model_downsampled_train_predictions, normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
# Left panel: integer counts; right panel: row-normalized rates
for cm_axis, cm_values, cm_format, cm_label in zip(ax, (cm_raw, cm_normalized), ('d', '.2f'), ('Raw Count', 'Normalized')):
    sns.heatmap(cm_values, annot=True, fmt=cm_format, cmap='Blues', ax=cm_axis)
    cm_axis.set_title(f'Confusion Matrix ({cm_label}): Best Stacked Model on Training Data')
    cm_axis.set_xlabel('Predicted')
    cm_axis.set_ylabel('Actual')
plt.tight_layout()
plt.show()

##################################
# Printing the F1 score and the
# per-class classification metrics
# obtained on the validation data
##################################
print(
    f"F1 Score on Validation Data: {stacked_unbalanced_class_best_model_downsampled_f1_validation:.4f}"
)
print(
    "\nClassification Report on Validation Data:\n",
    classification_report(
        y_true=y_validation,
        y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_validation),
    ),
)

F1 Score on Validation Data: 0.9524

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.75      0.43      0.55         7
           1       0.93      0.98      0.95        51

    accuracy                           0.91        58
   macro avg       0.84      0.70      0.75        58
weighted avg       0.90      0.91      0.90        58

##################################
# Building the confusion matrices
# (raw counts and normalize='true' rates)
# of the best stacked model
# on the validation data
# and drawing them side by side
##################################
cm_raw = confusion_matrix(y_true=y_validation,
                          y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_validation))
cm_normalized = confusion_matrix(y_true=y_validation,
                                 y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_validation),
                                 normalize='true')
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 8))
raw_count_panel, normalized_panel = ax

# Left panel: integer counts
sns.heatmap(data=cm_raw, annot=True, fmt='d', cmap='Blues', ax=raw_count_panel)
raw_count_panel.set_title('Confusion Matrix (Raw Count): Best Stacked Model on Validation Data')

# Right panel: normalized values shown with two decimals
sns.heatmap(data=cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=normalized_panel)
normalized_panel.set_title('Confusion Matrix (Normalized): Best Stacked Model on Validation Data')

# Both panels share the same axis labelling
for heatmap_panel in (raw_count_panel, normalized_panel):
    heatmap_panel.set_xlabel('Predicted')
    heatmap_panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

##################################
# Scoring the downsampled training data:
# decision-function output (used as the logit / log-odds axis)
# and predicted probability of the positive class
# (LUNG_CANCER=YES, second predict_proba column)
##################################
stacked_unbalanced_class_best_model_downsampled_logit_values = (
    stacked_unbalanced_class_best_model_downsampled.decision_function(X_train_cnn)
)
stacked_unbalanced_class_best_model_downsampled_probabilities = (
    stacked_unbalanced_class_best_model_downsampled.predict_proba(X_train_cnn)[:, 1]
)

##################################
# Ordering both arrays by ascending logit
# so that the curve is drawn left to right
##################################
stacked_unbalanced_class_best_model_downsampled_sorted_indices = (
    stacked_unbalanced_class_best_model_downsampled_logit_values.argsort()
)
stacked_unbalanced_class_best_model_downsampled_logit_values_sorted = (
    stacked_unbalanced_class_best_model_downsampled_logit_values[
        stacked_unbalanced_class_best_model_downsampled_sorted_indices
    ]
)
stacked_unbalanced_class_best_model_downsampled_probabilities_sorted = (
    stacked_unbalanced_class_best_model_downsampled_probabilities[
        stacked_unbalanced_class_best_model_downsampled_sorted_indices
    ]
)

##################################
# Drawing the estimated logistic curve
# (probability against logit)
# for the downsampled training data,
# with the training cases overlaid by class
##################################
plt.figure(figsize=(17, 8))
plt.plot(stacked_unbalanced_class_best_model_downsampled_logit_values_sorted,
         stacked_unbalanced_class_best_model_downsampled_probabilities_sorted,
         label='Classification Model Logistic Curve',
         color='black')
plt.ylim(-0.05, 1.05)
plt.xlim(-8.00, 8.00)

# Boolean masks selecting the cases of each target class
target_0_indices = y_train_cnn == 0
target_1_indices = y_train_cnn == 1

# One scatter layer per class, drawn in the order NO then YES
for class_mask, class_color, class_label in ((target_0_indices, 'blue', 'LUNG_CANCER=NO'),
                                             (target_1_indices, 'red', 'LUNG_CANCER=YES')):
    plt.scatter(stacked_unbalanced_class_best_model_downsampled_logit_values[class_mask],
                stacked_unbalanced_class_best_model_downsampled_probabilities[class_mask],
                color=class_color, alpha=0.40, s=100, marker='o', edgecolor='k', label=class_label)

# Horizontal reference line at the 0.5 probability threshold
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
plt.title('Logistic Curve (Downsampled Training Data): Stacked Model')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

##################################
# Persisting the best stacked model
# fitted on the downsampled training data
# under the models folder one level up
##################################
stacked_unbalanced_class_best_model_downsampled_file = os.path.join(
    "..", MODELS_PATH, "stacked_unbalanced_class_best_model_downsampled.pkl"
)
joblib.dump(stacked_unbalanced_class_best_model_downsampled,
            stacked_unbalanced_class_best_model_downsampled_file)

['..\\models\\stacked_unbalanced_class_best_model_downsampled.pkl']

##################################
# Collecting the F1 scores of every candidate model
# into one table: one column per model,
# one row per data set
# (training, cross-validation, validation)
##################################
set_labels = ['Train','Cross-Validation','Validation']
f1_plot = pd.DataFrame(
    {
        'INDIVIDUAL_ORIGINAL_TRAIN': [
            individual_unbalanced_class_best_model_original_f1_train,
            individual_unbalanced_class_best_model_original_f1_cv,
            individual_unbalanced_class_best_model_original_f1_validation,
        ],
        'STACKED_ORIGINAL_TRAIN': [
            stacked_unbalanced_class_best_model_original_f1_train,
            stacked_unbalanced_class_best_model_original_f1_cv,
            stacked_unbalanced_class_best_model_original_f1_validation,
        ],
        'INDIVIDUAL_UPSAMPLED_TRAIN': [
            individual_balanced_class_best_model_upsampled_f1_train_smote,
            individual_balanced_class_best_model_upsampled_f1_cv,
            individual_balanced_class_best_model_upsampled_f1_validation,
        ],
        'STACKED_UPSAMPLED_TRAIN': [
            stacked_balanced_class_best_model_upsampled_f1_train_smote,
            stacked_balanced_class_best_model_upsampled_f1_cv,
            stacked_balanced_class_best_model_upsampled_f1_validation,
        ],
        'INDIVIDUAL_DOWNSAMPLED_TRAIN': [
            individual_unbalanced_class_best_model_downsampled_f1_train_cnn,
            individual_unbalanced_class_best_model_downsampled_f1_cv,
            individual_unbalanced_class_best_model_downsampled_f1_validation,
        ],
        'STACKED_DOWNSAMPLED_TRAIN': [
            stacked_unbalanced_class_best_model_downsampled_f1_train_cnn,
            stacked_unbalanced_class_best_model_downsampled_f1_cv,
            stacked_unbalanced_class_best_model_downsampled_f1_validation,
        ],
    },
    index=set_labels,
)
display(f1_plot)

##################################
# Drawing the F1 scores of all models
# as grouped horizontal bars
# (one group per data set)
##################################
# NOTE: from here on f1_plot refers to the Axes returned by the plot call,
# no longer to the DataFrame it was built from
f1_plot = f1_plot.plot.barh(
    figsize=(10, 6),
    width=0.90,
)
f1_plot.set_xlim(left=0.00, right=1.00)
f1_plot.set_title("Classification Model Comparison by F1 Score")
f1_plot.set_xlabel("F1 Score")
f1_plot.set_ylabel("Data Set")
f1_plot.grid(False)

# Legend is anchored just outside the right edge of the axes
f1_plot.legend(
    loc='center left',
    bbox_to_anchor=(1.0, 0.5),
)

# Negative padding pulls each value label back inside its bar
for container in f1_plot.containers:
    f1_plot.bar_label(
        container,
        fmt='%.5f',
        padding=-50,
        color='white',
        fontweight='bold',
    )

##################################
# Scoring every candidate model
# on the test data using the F1 score
##################################

# Models fitted on the original training data
individual_unbalanced_class_best_model_original_f1_test = f1_score(
    y_true=y_test,
    y_pred=individual_unbalanced_class_best_model_original.predict(X_test),
)
stacked_unbalanced_class_best_model_original_f1_test = f1_score(
    y_true=y_test,
    y_pred=stacked_unbalanced_class_best_model_original.predict(X_test),
)

# Models fitted on the upsampled training data
individual_balanced_class_best_model_upsampled_f1_test = f1_score(
    y_true=y_test,
    y_pred=individual_balanced_class_best_model_upsampled.predict(X_test),
)
stacked_balanced_class_best_model_upsampled_f1_test = f1_score(
    y_true=y_test,
    y_pred=stacked_balanced_class_best_model_upsampled.predict(X_test),
)

# Models fitted on the downsampled training data
individual_unbalanced_class_best_model_downsampled_f1_test = f1_score(
    y_true=y_test,
    y_pred=individual_unbalanced_class_best_model_downsampled.predict(X_test),
)
stacked_unbalanced_class_best_model_downsampled_f1_test = f1_score(
    y_true=y_test,
    y_pred=stacked_unbalanced_class_best_model_downsampled.predict(X_test),
)

##################################
# Extending the F1 score table
# with the scores estimated
# from the test data
# (rows: training, cross-validation, validation, test)
##################################
set_labels = ['Train','Cross-Validation','Validation','Test']
updated_f1_plot = pd.DataFrame(
    {
        'INDIVIDUAL_ORIGINAL_TRAIN': [
            individual_unbalanced_class_best_model_original_f1_train,
            individual_unbalanced_class_best_model_original_f1_cv,
            individual_unbalanced_class_best_model_original_f1_validation,
            individual_unbalanced_class_best_model_original_f1_test,
        ],
        'STACKED_ORIGINAL_TRAIN': [
            stacked_unbalanced_class_best_model_original_f1_train,
            stacked_unbalanced_class_best_model_original_f1_cv,
            stacked_unbalanced_class_best_model_original_f1_validation,
            stacked_unbalanced_class_best_model_original_f1_test,
        ],
        'INDIVIDUAL_UPSAMPLED_TRAIN': [
            individual_balanced_class_best_model_upsampled_f1_train_smote,
            individual_balanced_class_best_model_upsampled_f1_cv,
            individual_balanced_class_best_model_upsampled_f1_validation,
            individual_balanced_class_best_model_upsampled_f1_test,
        ],
        'STACKED_UPSAMPLED_TRAIN': [
            stacked_balanced_class_best_model_upsampled_f1_train_smote,
            stacked_balanced_class_best_model_upsampled_f1_cv,
            stacked_balanced_class_best_model_upsampled_f1_validation,
            stacked_balanced_class_best_model_upsampled_f1_test,
        ],
        'INDIVIDUAL_DOWNSAMPLED_TRAIN': [
            individual_unbalanced_class_best_model_downsampled_f1_train_cnn,
            individual_unbalanced_class_best_model_downsampled_f1_cv,
            individual_unbalanced_class_best_model_downsampled_f1_validation,
            individual_unbalanced_class_best_model_downsampled_f1_test,
        ],
        'STACKED_DOWNSAMPLED_TRAIN': [
            stacked_unbalanced_class_best_model_downsampled_f1_train_cnn,
            stacked_unbalanced_class_best_model_downsampled_f1_cv,
            stacked_unbalanced_class_best_model_downsampled_f1_validation,
            stacked_unbalanced_class_best_model_downsampled_f1_test,
        ],
    },
    index=set_labels,
)
display(updated_f1_plot)

##################################
# Drawing the F1 scores of all models,
# now including the test data,
# as grouped horizontal bars
##################################
# NOTE: from here on updated_f1_plot refers to the Axes returned by the plot call,
# no longer to the DataFrame it was built from
updated_f1_plot = updated_f1_plot.plot.barh(
    figsize=(10, 8),
    width=0.90,
)
updated_f1_plot.set_xlim(left=0.00, right=1.00)
updated_f1_plot.set_title("Classification Model Comparison by F1 Score")
updated_f1_plot.set_xlabel("F1 Score")
updated_f1_plot.set_ylabel("Data Set")
updated_f1_plot.grid(False)

# Legend is anchored just outside the right edge of the axes
updated_f1_plot.legend(
    loc='center left',
    bbox_to_anchor=(1.0, 0.5),
)

# Negative padding pulls each value label back inside its bar
for container in updated_f1_plot.containers:
    updated_f1_plot.bar_label(
        container,
        fmt='%.5f',
        padding=-50,
        color='white',
        fontweight='bold',
    )

##################################
# Selecting the final model:
# the 'stacked_model' step of the pipeline
# fitted on the upsampled training data,
# chosen as the best performer on the test set
##################################
final_model = stacked_balanced_class_best_model_upsampled.named_steps['stacked_model']

# Display labels for the base learners, looked up by position later on.
# NOTE(review): presumably listed in the same order as the estimators
# of the stacked model - verify against the model definition
final_model_base_learner = [
    f'Stacked Model Base Learner: {base_learner_label}'
    for base_learner_label in ('Decision Trees', 'Random Forest', 'Support Vector Machine')
]

##################################
# Defining a function to compute and plot 
# the feature importance for a defined model
##################################
def plot_feature_importance(importance, feature_names, model_name):
    """Draw a horizontal bar chart of importance scores in ascending order.

    Parameters
    ----------
    importance : array-like
        One importance score per feature. Any 1-D sequence is accepted
        (NumPy array, list, tuple, pandas Series); it is converted to a
        NumPy array so that the positional sort order can be applied to it.
    feature_names : sequence
        Feature labels, indexable by integer position and aligned with
        ``importance``.
    model_name : str
        Text appended to the "Feature Importance - " figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Coerce first: indexing a plain list with an index array raises, and a
    # Series with a non-default index would be looked up by label, not position
    importance = np.asarray(importance)

    # Ascending sort order; barh draws bottom-up, so the largest bar ends on top
    indices = np.argsort(importance)
    plt.figure(figsize=(17, 8))
    plt.title(f"Feature Importance - {model_name}")
    plt.barh(range(len(importance)), importance[indices], align="center")
    plt.yticks(range(len(importance)), [feature_names[i] for i in indices])
    plt.tight_layout()
    plt.show()

##################################
# Taking the predictor names
# from the test data columns
##################################
feature_names = X_test.columns

##################################
# Plotting the predictor ranking of each base learner:
# feature_importances_ when the estimator exposes it
# (e.g. decision tree, random forest),
# otherwise absolute coefficients when coef_ is available
# (e.g. SVC with a linear kernel);
# estimators exposing neither are skipped
##################################
for index, (name, model) in enumerate(final_model.named_estimators_.items()):
    # Importance-based ranking takes precedence
    if hasattr(model, 'feature_importances_'):
        plot_feature_importance(model.feature_importances_,
                                feature_names,
                                model_name=final_model_base_learner[index])
        continue

    # Coefficient-based ranking: magnitude only, flattened to one dimension
    if hasattr(model, 'coef_'):
        importance = np.abs(model.coef_).flatten()
        plot_feature_importance(importance,
                                feature_names,
                                model_name=final_model_base_learner[index])

##################################
# Collecting, for each fitted base learner,
# the predicted probability of the positive class
# on the test data
##################################
# NOTE(review): the base learners are called on X_test directly and must
# all support predict_proba - confirm against the stacked model definition
base_learners_predictions = [
    base_learner.predict_proba(X_test)[:, 1]
    for base_learner in final_model.named_estimators_.values()
]

##################################
# Placing the base learners' predictions
# side by side, one column per base learner
##################################
meta_input = np.column_stack(base_learners_predictions)

##################################
# Labelling the meta-learner inputs
# after the base learner names
##################################
meta_feature_names = [f'Model Prediction - {x}' for x in final_model_base_learner]

##################################
# Plotting the ranking of the meta-learner inputs
# by absolute coefficient,
# when the final estimator exposes coef_
# (e.g. logistic regression)
##################################
if hasattr(final_model.final_estimator_, 'coef_'):
    importance = np.abs(final_model.final_estimator_.coef_).flatten()
    plot_feature_importance(importance,
                            meta_feature_names,
                            model_name='Stacked Model Meta-Learner: Logistic Regression')

##################################
# Rebuilding the upsampled training data
# for plotting categorical distributions
##################################
lung_cancer_train_smote = pd.concat([X_train_smote, y_train_smote], axis=1)

# Recode the first ten columns from 0/1 to 'Absent'/'Present'.
# The columns are cast to object and assigned by label so that the
# string values replace the columns outright, instead of being written
# into the existing numeric columns through .iloc (silent upcasting on
# assignment is deprecated in recent pandas). This mirrors how the
# test case frames are rebuilt further down in the file.
lung_cancer_train_smote_predictors = lung_cancer_train_smote.columns[0:10]
lung_cancer_train_smote[lung_cancer_train_smote_predictors] = (
    lung_cancer_train_smote[lung_cancer_train_smote_predictors]
    .astype(object)
    .replace({0: 'Absent', 1: 'Present'})
)

# Recode the target from 0/1 to 'No'/'Yes' as a categorical
lung_cancer_train_smote['LUNG_CANCER'] = lung_cancer_train_smote['LUNG_CANCER'].astype('category')
lung_cancer_train_smote['LUNG_CANCER'] = lung_cancer_train_smote['LUNG_CANCER'].cat.rename_categories({0: 'No', 1: 'Yes'})

# Store the first eleven columns as categoricals
# (assumes ten predictors followed by the target - TODO confirm)
lung_cancer_train_smote[lung_cancer_train_smote.columns[0:11]] = lung_cancer_train_smote[lung_cancer_train_smote.columns[0:11]].astype('category')
lung_cancer_train_smote.head()

##################################
# Plotting the categorical distributions
# of the predictors by LUNG_CANCER class
# in the upsampled training data
##################################
fig, axs = plt.subplots(2, 5, figsize=(17, 8))

colors = ['blue','red']
level_order = ['Absent','Present']

# One panel per predictor; the 2x5 grid is filled row by row
countplot_predictors = ['YELLOW_FINGERS',
                        'ANXIETY',
                        'PEER_PRESSURE',
                        'FATIGUE',
                        'ALLERGY',
                        'WHEEZING',
                        'ALCOHOL_CONSUMING',
                        'COUGHING',
                        'SWALLOWING_DIFFICULTY',
                        'CHEST_PAIN']

for countplot_predictor, countplot_axis in zip(countplot_predictors, axs.flat):
    sns.countplot(x=countplot_predictor, hue='LUNG_CANCER', data=lung_cancer_train_smote, ax=countplot_axis, order=level_order, palette=colors)
    countplot_axis.set_title(countplot_predictor)
    countplot_axis.set_ylabel('Classification Model Training Case Count')
    countplot_axis.set_xlabel(None)
    countplot_axis.set_ylim(0, 200)
    # The legend is created before the bars are restyled below
    countplot_axis.legend(title='LUNG_CANCER', loc='upper center')
    # Restyle the first four patches (blue, blue, red, red) with a light fill;
    # presumably the two bars of the first hue level followed by the two
    # bars of the second - verify against the installed seaborn version
    for patch, color in zip(countplot_axis.patches, ['blue','blue','red','red'] ):
        patch.set_facecolor(color)
        patch.set_alpha(0.2)

plt.tight_layout()
plt.show()

##################################
# Drawing the estimated logistic curve
# (probability against logit)
# of the final classification model:
# a stacked model with a logistic regression
# meta-learner and random forest, SVC
# and decision tree base learners,
# with the upsampled training cases overlaid by class
##################################
plt.figure(figsize=(17, 8))
plt.plot(stacked_balanced_class_best_model_upsampled_logit_values_sorted,
         stacked_balanced_class_best_model_upsampled_probabilities_sorted,
         label='Classification Model Logistic Curve',
         color='black')
plt.ylim(-0.05, 1.05)
plt.xlim(-6.00, 6.00)

# Boolean masks selecting the cases of each target class
target_0_indices = y_train_smote == 0
target_1_indices = y_train_smote == 1

# One scatter layer per class, drawn in the order NO then YES
for class_mask, class_color, class_label in ((target_0_indices, 'blue', 'LUNG_CANCER=NO'),
                                             (target_1_indices, 'red', 'LUNG_CANCER=YES')):
    plt.scatter(stacked_balanced_class_best_model_upsampled_logit_values[class_mask],
                stacked_balanced_class_best_model_upsampled_probabilities[class_mask],
                color=class_color, alpha=0.40, s=100, marker='o', edgecolor='k', label=class_label)

# Horizontal reference line at the 0.5 probability threshold
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')
plt.title('Final Classification Model: Stacked Model (Meta-Learner = Logistic Regression, Base Learners: Random Forest, Support Vector Classifier, Decision Tree)')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

##################################
# Defining the predictor values
# (1 = present, 0 = absent)
# of a low-risk test case
# as a single-row data frame
##################################
X_sample = dict(
    YELLOW_FINGERS=1,
    ANXIETY=0,
    PEER_PRESSURE=0,
    FATIGUE=0,
    ALLERGY=0,
    WHEEZING=1,
    ALCOHOL_CONSUMING=0,
    COUGHING=0,
    SWALLOWING_DIFFICULTY=1,
    CHEST_PAIN=1,
)
X_test_sample = pd.DataFrame([X_sample])
X_test_sample.head()

##################################
# Making a labelled copy of the low-risk test case
# ('Absent' / 'Present' instead of 0 / 1)
# for plotting categorical distributions
##################################
X_test_sample_category = X_test_sample.copy()
int_test_columns = X_test_sample_category.columns

# Cast to object before recoding so the columns can hold the string labels
X_test_sample_category[int_test_columns] = (
    X_test_sample_category[int_test_columns]
    .astype(object)
    .replace({0: 'Absent', 1: 'Present'})
)
X_test_sample_category.head()

##################################
# Plotting the categorical distributions
# of the upsampled training data
# with the predictor levels of
# a low-risk test case marked on each panel
##################################
fig, axs = plt.subplots(2, 5, figsize=(17, 8))

colors = ['blue','red']
level_order = ['Absent','Present']

# One panel per predictor; the 2x5 grid is filled row by row
countplot_predictors = ['YELLOW_FINGERS',
                        'ANXIETY',
                        'PEER_PRESSURE',
                        'FATIGUE',
                        'ALLERGY',
                        'WHEEZING',
                        'ALCOHOL_CONSUMING',
                        'COUGHING',
                        'SWALLOWING_DIFFICULTY',
                        'CHEST_PAIN']

for countplot_predictor, countplot_axis in zip(countplot_predictors, axs.flat):
    sns.countplot(x=countplot_predictor, hue='LUNG_CANCER', data=lung_cancer_train_smote, ax=countplot_axis, order=level_order, palette=colors)
    # Dashed vertical line at the position of the test case's level
    # (0 for 'Absent', 1 for 'Present', following level_order)
    countplot_axis.axvline(level_order.index(X_test_sample_category[countplot_predictor].iloc[0]), color='black', linestyle='--', linewidth=3)
    countplot_axis.set_title(countplot_predictor)
    countplot_axis.set_ylabel('Classification Model Training Case Count')
    countplot_axis.set_xlabel(None)
    countplot_axis.set_ylim(0, 200)
    # The legend is created before the bars are restyled below
    countplot_axis.legend(title='LUNG_CANCER', loc='upper center')
    # Restyle the first four patches (blue, blue, red, red) with a light fill;
    # presumably the two bars of the first hue level followed by the two
    # bars of the second - verify against the installed seaborn version
    for patch, color in zip(countplot_axis.patches, ['blue','blue','red','red'] ):
        patch.set_facecolor(color)
        patch.set_alpha(0.2)

plt.tight_layout()
plt.show()

##################################
# Scoring the test case with the final model:
# decision-function output (reported as the risk index),
# predicted probability of the positive class,
# and the risk category at the 0.50 probability cut-off
##################################
X_sample_logit = stacked_balanced_class_best_model_upsampled.decision_function(X_test_sample)[0]
X_sample_probability = stacked_balanced_class_best_model_upsampled.predict_proba(X_test_sample)[0, 1]

# Probabilities below 0.50 are labelled low-risk, everything else high-risk
if X_sample_probability < 0.50:
    X_sample_class = "Low-Risk"
else:
    X_sample_class = "High-Risk"

print(f"Test Case Risk Index: {X_sample_logit}")
print(f"Test Case Probability: {X_sample_probability}")
print(f"Test Case Risk Category: {X_sample_class}")

Test Case Risk Index: -1.2117837409390746
Test Case Probability: 0.22938559072691203
Test Case Risk Category: Low-Risk

##################################
# Locating the test case
# (logit and estimated probability)
# on the estimated logistic curve
# of the final classification model,
# over the upsampled training cases
##################################
plt.figure(figsize=(17, 8))
plt.plot(stacked_balanced_class_best_model_upsampled_logit_values_sorted,
         stacked_balanced_class_best_model_upsampled_probabilities_sorted,
         label='Classification Model Logistic Curve',
         color='black')
plt.ylim(-0.05, 1.05)
plt.xlim(-6.00, 6.00)

# Boolean masks selecting the training cases of each target class
target_0_indices = y_train_smote == 0
target_1_indices = y_train_smote == 1

# Horizontal reference line at the 0.5 probability threshold
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')

# One faint scatter layer per class, drawn in the order No then Yes
for training_mask, training_color, training_label in (
        (target_0_indices, 'blue', 'Classification Model Training Cases: LUNG_CANCER = No'),
        (target_1_indices, 'red', 'Classification Model Training Cases: LUNG_CANCER = Yes')):
    plt.scatter(stacked_balanced_class_best_model_upsampled_logit_values[training_mask],
                stacked_balanced_class_best_model_upsampled_probabilities[training_mask],
                color=training_color, alpha=0.20, s=100, marker='o', edgecolor='k', label=training_label)

# Marker colour and legend text per risk category;
# any other category value draws no test case marker
test_case_styles = {"Low-Risk": ('blue', 'Test Case (Low-Risk)'),
                    "High-Risk": ('red', 'Test Case (High-Risk)')}
if X_sample_class in test_case_styles:
    test_case_color, test_case_label = test_case_styles[X_sample_class]
    # Square marker on top of the other layers, with dashed crosshair lines
    plt.scatter(X_sample_logit, X_sample_probability, color=test_case_color, s=125, edgecolor='k', label=test_case_label, marker='s', zorder=5)
    plt.axvline(X_sample_logit, color='black', linestyle='--', linewidth=3)
    plt.axhline(X_sample_probability, color='black', linestyle='--', linewidth=3)

plt.title('Final Classification Model: Stacked Model (Meta-Learner = Logistic Regression, Base Learners = Random Forest, Support Vector Classifier, Decision Tree)')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(False)

# Legend is placed below the axes in three columns
plt.legend(facecolor='white', framealpha=1, loc='upper center', bbox_to_anchor=(0.5, -0.10), ncol=3)
plt.tight_layout(rect=[0, 0, 1.00, 0.95])
plt.show()

##################################
# Defining the predictor values
# of a high-risk test case
# as a single-row data frame
##################################
X_sample = dict(
    YELLOW_FINGERS=1,
    ANXIETY=0,
    PEER_PRESSURE=1,
    FATIGUE=0,
    ALLERGY=1,
    WHEEZING=1,
    ALCOHOL_CONSUMING=0,
    COUGHING=1,
    SWALLOWING_DIFFICULTY=1,
    CHEST_PAIN=1,
)
# One dictionary in the list yields one row; keys become the columns
X_test_sample = pd.DataFrame([X_sample])
X_test_sample.head()

##################################
# Recoding the high-risk test case
# into 'Absent' / 'Present' labels
# for plotting categorical distributions
##################################
presence_labels = {0: 'Absent', 1: 'Present'}

# Work on a copy so the numeric test case stays untouched
X_test_sample_category = X_test_sample.copy()
int_test_columns = X_test_sample_category.columns
# Cast every column to object first, then swap the 0/1 codes for their labels
X_test_sample_category[int_test_columns] = (
    X_test_sample_category[int_test_columns]
    .astype(object)
    .replace(presence_labels)
)
X_test_sample_category.head()

##################################
# Plotting the categorical distributions
# for the high-risk test case
##################################
# One panel per predictor on a 2x5 grid: each panel shows the training
# case counts per category level split by LUNG_CANCER, with a dashed
# vertical line at the level observed for the test case.
fig, axs = plt.subplots(2, 5, figsize=(17, 8))

colors = ['blue','red']
level_order = ['Absent','Present']

# Predictors listed in panel order (row by row across the grid)
test_case_predictors = [
    'YELLOW_FINGERS',
    'ANXIETY',
    'PEER_PRESSURE',
    'FATIGUE',
    'ALLERGY',
    'WHEEZING',
    'ALCOHOL_CONSUMING',
    'COUGHING',
    'SWALLOWING_DIFFICULTY',
    'CHEST_PAIN',
]

for predictor_ax, predictor in zip(axs.flat, test_case_predictors):
    sns.countplot(x=predictor, hue='LUNG_CANCER', data=lung_cancer_train_smote, ax=predictor_ax, order=level_order, palette=colors)
    # The x position of a level is its index in level_order
    predictor_ax.axvline(level_order.index(X_test_sample_category[predictor].iloc[0]), color='black', linestyle='--', linewidth=3)
    predictor_ax.set_title(predictor)
    predictor_ax.set_ylabel('Classification Model Training Case Count')
    predictor_ax.set_xlabel(None)
    predictor_ax.set_ylim(0, 200)
    predictor_ax.legend(title='LUNG_CANCER', loc='upper center')
    # Recolour and fade the first four patches; this presumably relies on
    # seaborn emitting the bars grouped by hue level - verify if the
    # seaborn version changes
    for patch, color in zip(predictor_ax.patches, ['blue','blue','red','red'] ):
        patch.set_facecolor(color)
        patch.set_alpha(0.2)

plt.tight_layout()
plt.show()

##################################
# Scoring the high-risk test case:
# logit, estimated probability
# and resulting risk category
##################################
# Risk index: first element of the stacked model's decision function output
X_sample_logit = stacked_balanced_class_best_model_upsampled.decision_function(X_test_sample)[0]
# Estimated probability: first row, second column of predict_proba
X_sample_probability = stacked_balanced_class_best_model_upsampled.predict_proba(X_test_sample)[0, 1]

# Probabilities strictly below the 0.50 threshold are labelled low-risk
if X_sample_probability < 0.50:
    X_sample_class = "Low-Risk"
else:
    X_sample_class = "High-Risk"

print(f"Test Case Risk Index: {X_sample_logit}")
print(f"Test Case Probability: {X_sample_probability}")
print(f"Test Case Risk Category: {X_sample_class}")

Test Case Risk Index: 3.4784950973590973
Test Case Probability: 0.9700696569701589
Test Case Risk Category: High-Risk

##################################
# Locating the high-risk test case
# (logit and estimated probability)
# on the estimated logistic curve
# of the final classification model
##################################
# Boolean masks splitting the training cases by LUNG_CANCER class
target_0_indices = y_train_smote == 0
target_1_indices = y_train_smote == 1

plt.figure(figsize=(17, 8))
plt.plot(
    stacked_balanced_class_best_model_upsampled_logit_values_sorted,
    stacked_balanced_class_best_model_upsampled_probabilities_sorted,
    label='Classification Model Logistic Curve',
    color='black',
)
plt.ylim(-0.05, 1.05)
plt.xlim(-6.00, 6.00)
plt.axhline(0.5, color='green', linestyle='--', label='Classification Model Threshold')

# Training cases are drawn class by class (No first, then Yes)
# so that the legend entries keep their order
for class_mask, class_color, class_label in (
    (target_0_indices, 'blue', 'Classification Model Training Cases: LUNG_CANCER = No'),
    (target_1_indices, 'red', 'Classification Model Training Cases: LUNG_CANCER = Yes'),
):
    plt.scatter(
        stacked_balanced_class_best_model_upsampled_logit_values[class_mask],
        stacked_balanced_class_best_model_upsampled_probabilities[class_mask],
        color=class_color,
        alpha=0.20,
        s=100,
        marker='o',
        edgecolor='k',
        label=class_label,
    )

# Marker colour and legend label of the test case per risk category
test_case_styles = {
    "Low-Risk": ('blue', 'Test Case (Low-Risk)'),
    "High-Risk": ('red', 'Test Case (High-Risk)'),
}
if X_sample_class in test_case_styles:
    test_case_color, test_case_label = test_case_styles[X_sample_class]
    plt.scatter(
        X_sample_logit,
        X_sample_probability,
        color=test_case_color,
        s=125,
        edgecolor='k',
        label=test_case_label,
        marker='s',
        zorder=5,
    )
    # Dashed crosshair through the test case coordinates
    plt.axvline(X_sample_logit, color='black', linestyle='--', linewidth=3)
    plt.axhline(X_sample_probability, color='black', linestyle='--', linewidth=3)

plt.title('Final Classification Model: Stacked Model (Meta-Learner = Logistic Regression, Base Learners = Random Forest, Support Vector Classifier, Decision Tree)')
plt.xlabel('Logit (Log-Odds)')
plt.ylabel('Estimated Lung Cancer Probability')
plt.grid(False)
# Legend is anchored below the axes, laid out in three columns
plt.legend(facecolor='white', framealpha=1, loc='upper center', bbox_to_anchor=(0.5, -0.10), ncol=3)
plt.tight_layout(rect=[0, 0, 1.00, 0.95])
plt.show()

from IPython.display import HTML, display

# Inline stylesheet setting the font size and family of rendered HTML output
notebook_font_css = "<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"
display(HTML(notebook_font_css))

	ChiSquare.Test.Statistic	ChiSquare.Test.PValue
LUNG_CANCER_ALLERGY	31.238952	2.281422e-08
LUNG_CANCER_ALCOHOL_CONSUMING	24.005406	9.606559e-07
LUNG_CANCER_SWALLOWING_DIFFICULTY	19.307277	1.112814e-05
LUNG_CANCER_WHEEZING	17.723096	2.555055e-05
LUNG_CANCER_COUGHING	17.606122	2.717123e-05
LUNG_CANCER_CHEST_PAIN	10.083198	1.496275e-03
LUNG_CANCER_PEER_PRESSURE	9.641594	1.902201e-03
LUNG_CANCER_YELLOW_FINGERS	9.088186	2.572659e-03
LUNG_CANCER_FATIGUE	6.081100	1.366356e-02
LUNG_CANCER_ANXIETY	5.648390	1.747141e-02
LUNG_CANCER_CHRONIC_DISEASE	3.161200	7.540772e-02
LUNG_CANCER_GENDER	1.021545	3.121527e-01
LUNG_CANCER_SHORTNESS_OF_BREATH	0.790604	3.739175e-01
LUNG_CANCER_SMOKING	0.722513	3.953209e-01

	INDIVIDUAL_ORIGINAL_TRAIN	STACKED_ORIGINAL_TRAIN	INDIVIDUAL_UPSAMPLED_TRAIN	STACKED_UPSAMPLED_TRAIN	INDIVIDUAL_DOWNSAMPLED_TRAIN	STACKED_DOWNSAMPLED_TRAIN
Train	0.930556	0.940351	0.912162	0.956811	0.853333	0.821918
Cross-Validation	0.911574	0.912498	0.910870	0.948878	0.753711	0.753114
Validation	0.949495	0.914894	0.927835	0.961538	0.970874	0.952381

	INDIVIDUAL_ORIGINAL_TRAIN	STACKED_ORIGINAL_TRAIN	INDIVIDUAL_UPSAMPLED_TRAIN	STACKED_UPSAMPLED_TRAIN	INDIVIDUAL_DOWNSAMPLED_TRAIN	STACKED_DOWNSAMPLED_TRAIN
Train	0.930556	0.940351	0.912162	0.956811	0.853333	0.821918
Cross-Validation	0.911574	0.912498	0.910870	0.948878	0.753711	0.753114
Validation	0.949495	0.914894	0.927835	0.961538	0.970874	0.952381
Test	0.904762	0.878049	0.890625	0.948905	0.939394	0.916031

Model Deployment: Estimating Lung Cancer Probabilities From Demographic Factors, Clinical Symptoms And Behavioral Indicators¶

John Pauline Pineda

August 31, 2024

1. Table of Contents ¶

1.1. Data Background ¶

1.2. Data Description ¶

1.3. Data Quality Assessment ¶

1.4. Data Preprocessing ¶

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

1.5.2 Hypothesis Testing ¶

1.6. Predictive Model Development ¶

1.6.1 Pre-Modelling Data Preparation ¶

1.6.2 Data Splitting ¶

1.6.3 Modelling Pipeline Development ¶

1.6.3.1 Individual Classifier ¶

1.6.3.2 Stacked Classifier ¶

1.6.4 Model Fitting using Original Training Data | Hyperparameter Tuning | Validation ¶

1.6.4.1 Individual Classifier ¶

1.6.4.2 Stacked Classifier ¶

1.6.5 Model Fitting using Upsampled Training Data | Hyperparameter Tuning | Validation ¶

1.6.5.1 Individual Classifier ¶

1.6.5.2 Stacked Classifier ¶

1.6.6 Model Fitting using Downsampled Training Data | Hyperparameter Tuning | Validation ¶

1.6.6.1 Individual Classifier ¶

1.6.6.2 Stacked Classifier ¶

1.6.7 Model Selection ¶

1.6.8 Model Testing ¶

1.6.9 Model Inference ¶

1.7. Predictive Model Deployment Using Streamlit and Streamlit Community Cloud ¶

1.7.1 Model Prediction Application Code Development ¶

1.7.2 User Interface Application Code Development ¶

1.7.3 Web Application ¶

2. Summary ¶

3. References ¶

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	M	69	1	2	2	1	1	2	1	2	2	2	2	2	2	YES
1	M	74	2	1	1	1	2	2	2	1	1	1	2	2	2	YES
2	F	59	1	1	1	2	1	2	1	2	1	2	2	1	2	NO
3	M	63	2	2	2	1	1	1	1	1	2	1	1	2	2	NO
4	F	63	1	2	1	1	1	1	1	2	1	2	2	1	1	NO

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	M	69	Absent	Present	Present	Absent	Absent	Present	Absent	Present	Present	Present	Present	Present	Present	YES
1	M	74	Present	Absent	Absent	Absent	Present	Present	Present	Absent	Absent	Absent	Present	Present	Present	YES
2	F	59	Absent	Absent	Absent	Present	Absent	Present	Absent	Present	Absent	Present	Present	Absent	Present	NO
3	M	63	Present	Present	Present	Absent	Absent	Absent	Absent	Absent	Present	Absent	Absent	Present	Present	NO
4	F	63	Absent	Present	Absent	Absent	Absent	Absent	Absent	Present	Absent	Present	Present	Absent	Absent	NO

	count	unique	top	freq
GENDER	309	2	M	162
SMOKING	309	2	Present	174
YELLOW_FINGERS	309	2	Present	176
ANXIETY	309	2	Absent	155
PEER_PRESSURE	309	2	Present	155
CHRONIC_DISEASE	309	2	Present	156
FATIGUE	309	2	Present	208
ALLERGY	309	2	Present	172
WHEEZING	309	2	Present	172
ALCOHOL_CONSUMING	309	2	Present	172
COUGHING	309	2	Present	179
SHORTNESS_OF_BREATH	309	2	Present	198
SWALLOWING_DIFFICULTY	309	2	Absent	164
CHEST_PAIN	309	2	Present	172
LUNG_CANCER	309	2	YES	270

	Column.Name	Column.Type	Row.Count	Non.Null.Count	Fill.Rate
0	GENDER	category	309	309	1.0
1	AGE	int64	309	309	1.0
2	SMOKING	object	309	309	1.0
3	YELLOW_FINGERS	object	309	309	1.0
4	ANXIETY	object	309	309	1.0
5	PEER_PRESSURE	object	309	309	1.0
6	CHRONIC_DISEASE	object	309	309	1.0
7	FATIGUE	object	309	309	1.0
8	ALLERGY	object	309	309	1.0
9	WHEEZING	object	309	309	1.0
10	ALCOHOL_CONSUMING	object	309	309	1.0
11	COUGHING	object	309	309	1.0
12	SHORTNESS_OF_BREATH	object	309	309	1.0
13	SWALLOWING_DIFFICULTY	object	309	309	1.0
14	CHEST_PAIN	object	309	309	1.0
15	LUNG_CANCER	object	309	309	1.0

	Row.Name	Column.Count	Null.Count	Missing.Rate
0	0	16	0	0.0
1	1	16	0	0.0
2	2	16	0	0.0
3	3	16	0	0.0
4	4	16	0	0.0
...	...	...	...	...
304	304	16	0	0.0
305	305	16	0	0.0
306	306	16	0	0.0
307	307	16	0	0.0
308	308	16	0	0.0

	Categorical.Column.Name	First.Mode	Second.Mode	First.Mode.Count	Second.Mode.Count	First.Second.Mode.Ratio	Unique.Count	Row.Count	Unique.Count.Ratio
0	GENDER	M	F	162	147	1.102041	2	309	0.006472
1	SMOKING	Present	Absent	174	135	1.288889	2	309	0.006472
2	YELLOW_FINGERS	Present	Absent	176	133	1.323308	2	309	0.006472
3	ANXIETY	Absent	Present	155	154	1.006494	2	309	0.006472
4	PEER_PRESSURE	Present	Absent	155	154	1.006494	2	309	0.006472
5	CHRONIC_DISEASE	Present	Absent	156	153	1.019608	2	309	0.006472
6	FATIGUE	Present	Absent	208	101	2.059406	2	309	0.006472
7	ALLERGY	Present	Absent	172	137	1.255474	2	309	0.006472
8	WHEEZING	Present	Absent	172	137	1.255474	2	309	0.006472
9	ALCOHOL_CONSUMING	Present	Absent	172	137	1.255474	2	309	0.006472
10	COUGHING	Present	Absent	179	130	1.376923	2	309	0.006472
11	SHORTNESS_OF_BREATH	Present	Absent	198	111	1.783784	2	309	0.006472
12	SWALLOWING_DIFFICULTY	Absent	Present	164	145	1.131034	2	309	0.006472
13	CHEST_PAIN	Present	Absent	172	137	1.255474	2	309	0.006472
14	LUNG_CANCER	YES	NO	270	39	6.923077	2	309	0.006472

	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN
0	69.0	0	1	1	0	0	1	0	1	1	1	1	1	1
1	74.0	1	0	0	0	1	1	1	0	0	0	1	1	1
2	59.0	0	0	0	1	0	1	0	1	0	1	1	0	1
3	63.0	1	1	1	0	0	0	0	0	1	0	0	1	1
4	63.0	0	1	0	0	0	0	0	1	0	1	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	56.0	0	0	0	1	1	1	0	0	1	1	1	1	0
305	70.0	1	0	0	0	0	1	1	1	1	1	1	0	1
306	58.0	1	0	0	0	0	0	1	1	1	1	0	0	1
307	67.0	1	0	1	0	0	1	1	0	1	1	1	0	1
308	62.0	0	0	0	1	0	1	1	1	1	0	0	1	0

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	69	0	1	1	0	0	1	0	1	1	1	1	1	1	1
1	1	74	1	0	0	0	1	1	1	0	0	0	1	1	1	1
2	0	59	0	0	0	1	0	1	0	1	0	1	1	0	1	0
3	1	63	1	1	1	0	0	0	0	0	1	0	0	1	1	0
4	0	63	0	1	0	0	0	0	0	1	0	1	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	0	56	0	0	0	1	1	1	0	0	1	1	1	1	0	1
305	1	70	1	0	0	0	0	1	1	1	1	1	1	0	1	1
306	1	58	1	0	0	0	0	0	1	1	1	1	0	0	1	1
307	1	67	1	0	1	0	0	1	1	0	1	1	1	0	1	1
308	1	62	0	0	0	1	0	1	1	1	1	0	0	1	0	1

	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	1	0	1	0	1	1	1	1	1	1
1	0	0	0	1	1	0	0	0	1	1	1
2	0	0	1	1	0	1	0	1	0	1	0
3	1	1	0	0	0	0	1	0	1	1	0
4	1	0	0	0	0	1	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...
304	0	0	1	1	0	0	1	1	1	0	1
305	0	0	0	1	1	1	1	1	0	1	1
306	0	0	0	0	1	1	1	1	0	1	1
307	0	1	0	1	1	0	1	1	0	1	1
308	0	0	1	1	1	1	1	0	1	0	1

	LUNG_CANCER	Count	Percentage
0	0	39	12.621359
1	1	270	87.378641

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	M	69	1	2	2	1	1	2	1	2	2	2	2	2	2	YES
1	M	74	2	1	1	1	2	2	2	1	1	1	2	2	2	YES
2	F	59	1	1	1	2	1	2	1	2	1	2	2	1	2	NO
3	M	63	2	2	2	1	1	1	1	1	2	1	1	2	2	NO
4	F	63	1	2	1	1	1	1	1	2	1	2	2	1	1	NO

	Row.Name	Column.Count	Null.Count	Missing.Rate
0	0	16	0	0.0
1	1	16	0	0.0
2	2	16	0	0.0
3	3	16	0	0.0
4	4	16	0	0.0
...	...	...	...	...
304	304	16	0	0.0
305	305	16	0	0.0
306	306	16	0	0.0
307	307	16	0	0.0
308	308	16	0	0.0

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	69	0	1	1	0	0	1	0	1	1	1	1	1	1	1
1	1	74	1	0	0	0	1	1	1	0	0	0	1	1	1	1
2	0	59	0	0	0	1	0	1	0	1	0	1	1	0	1	0
3	1	63	1	1	1	0	0	0	0	0	1	0	0	1	1	0
4	0	63	0	1	0	0	0	0	0	1	0	1	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	0	56	0	0	0	1	1	1	0	0	1	1	1	1	0	1
305	1	70	1	0	0	0	0	1	1	1	1	1	1	0	1	1
306	1	58	1	0	0	0	0	0	1	1	1	1	0	0	1	1
307	1	67	1	0	1	0	0	1	1	0	1	1	1	0	1	1
308	1	62	0	0	0	1	0	1	1	1	1	0	0	1	0	1

	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	1	0	1	0	1	1	1	1	1	1
1	0	0	0	1	1	0	0	0	1	1	1
2	0	0	1	1	0	1	0	1	0	1	0
3	1	1	0	0	0	0	1	0	1	1	0
4	1	0	0	0	0	1	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...
304	0	0	1	1	0	0	1	1	1	0	1
305	0	0	0	1	1	1	1	1	0	1	1
306	0	0	0	0	1	1	1	1	0	1	1
307	0	1	0	1	1	0	1	1	0	1	1
308	0	0	1	1	1	1	1	0	1	0	1

Model Deployment: Estimating Lung Cancer Probabilities From Demographic Factors, Clinical Symptoms And Behavioral Indicators¶

John Pauline Pineda August 31, 2024

1. Table of Contents ¶

1.1. Data Background ¶

1.2. Data Description ¶

1.3. Data Quality Assessment ¶

1.4. Data Preprocessing ¶

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

1.5.2 Hypothesis Testing ¶

1.6. Predictive Model Development ¶

1.6.1 Pre-Modelling Data Preparation ¶

1.6.2 Data Splitting ¶

1.6.3 Modelling Pipeline Development ¶

1.6.3.1 Individual Classifier ¶

1.6.3.2 Stacked Classifier ¶

1.6.4 Model Fitting using Original Training Data | Hyperparameter Tuning | Validation ¶

1.6.4.1 Individual Classifier ¶

1.6.4.2 Stacked Classifier ¶

1.6.5 Model Fitting using Upsampled Training Data | Hyperparameter Tuning | Validation ¶

1.6.5.1 Individual Classifier ¶

1.6.5.2 Stacked Classifier ¶

1.6.6 Model Fitting using Downsampled Training Data | Hyperparameter Tuning | Validation ¶

1.6.6.1 Individual Classifier ¶

1.6.6.2 Stacked Classifier ¶

1.6.7 Model Selection ¶

1.6.8 Model Testing ¶

1.6.9 Model Inference ¶

1.7. Predictive Model Deployment Using Streamlit and Streamlit Community Cloud ¶

1.7.1 Model Prediction Application Code Development ¶

1.7.2 User Interface Application Code Development ¶

1.7.3 Web Application ¶

2. Summary ¶

3. References ¶

John Pauline Pineda

August 31, 2024

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	M	69	1	2	2	1	1	2	1	2	2	2	2	2	2	YES
1	M	74	2	1	1	1	2	2	2	1	1	1	2	2	2	YES
2	F	59	1	1	1	2	1	2	1	2	1	2	2	1	2	NO
3	M	63	2	2	2	1	1	1	1	1	2	1	1	2	2	NO
4	F	63	1	2	1	1	1	1	1	2	1	2	2	1	1	NO

	Row.Name	Column.Count	Null.Count	Missing.Rate
0	0	16	0	0.0
1	1	16	0	0.0
2	2	16	0	0.0
3	3	16	0	0.0
4	4	16	0	0.0
...	...	...	...	...
304	304	16	0	0.0
305	305	16	0	0.0
306	306	16	0	0.0
307	307	16	0	0.0
308	308	16	0	0.0

	GENDER	AGE	SMOKING	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	CHRONIC_DISEASE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SHORTNESS_OF_BREATH	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	69	0	1	1	0	0	1	0	1	1	1	1	1	1	1
1	1	74	1	0	0	0	1	1	1	0	0	0	1	1	1	1
2	0	59	0	0	0	1	0	1	0	1	0	1	1	0	1	0
3	1	63	1	1	1	0	0	0	0	0	1	0	0	1	1	0
4	0	63	0	1	0	0	0	0	0	1	0	1	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	0	56	0	0	0	1	1	1	0	0	1	1	1	1	0	1
305	1	70	1	0	0	0	0	1	1	1	1	1	1	0	1	1
306	1	58	1	0	0	0	0	0	1	1	1	1	0	0	1	1
307	1	67	1	0	1	0	0	1	1	0	1	1	1	0	1	1
308	1	62	0	0	0	1	0	1	1	1	1	0	0	1	0	1

	YELLOW_FINGERS	ANXIETY	PEER_PRESSURE	FATIGUE	ALLERGY	WHEEZING	ALCOHOL_CONSUMING	COUGHING	SWALLOWING_DIFFICULTY	CHEST_PAIN	LUNG_CANCER
0	1	1	0	1	0	1	1	1	1	1	1
1	0	0	0	1	1	0	0	0	1	1	1
2	0	0	1	1	0	1	0	1	0	1	0
3	1	1	0	0	0	0	1	0	1	1	0
4	1	0	0	0	0	1	0	1	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...
304	0	0	1	1	0	0	1	1	1	0	1
305	0	0	0	1	1	1	1	1	0	1	1
306	0	0	0	0	1	1	1	1	0	1	1
307	0	1	0	1	1	0	1	1	0	1	1
308	0	0	1	1	1	1	1	0	1	0	1