Model Deployment: Detecting and Analyzing Machine Learning Model Drift Using Open-Source Monitoring Tools¶


John Pauline Pineda

October 31, 2025


  • 1. Table of Contents
    • 1.1 Data Background
    • 1.2 Data Description
    • 1.3 Data Quality Assessment
    • 1.4 Data Preprocessing
      • 1.4.1 Data Splitting
      • 1.4.2 Outlier and Distributional Shape Analysis
      • 1.4.3 Collinearity
    • 1.5 Data Exploration
      • 1.5.1 Exploratory Data Analysis
      • 1.5.2 Hypothesis Testing
    • 1.6 Premodelling Data Preparation
      • 1.6.1 Preprocessed Data Description
      • 1.6.2 Preprocessing Pipeline Development
    • 1.7 Model Development and Validation
      • 1.7.1 Random Forest
      • 1.7.2 AdaBoost
      • 1.7.3 Gradient Boosting
      • 1.7.4 XGBoost
      • 1.7.5 Light GBM
      • 1.7.6 CatBoost
    • 1.8 Model Selection
    • 1.9 Model Monitoring using the NannyML Framework
      • 1.9.1 Simulated Baseline Control
      • 1.9.2 Simulated Covariate Drift
      • 1.9.3 Simulated Prior Shift
      • 1.9.4 Simulated Concept Drift
      • 1.9.5 Simulated Missingness Spike
      • 1.9.6 Simulated Seasonal Pattern
    • 1.10 Consolidated Findings
  • 2. Summary
  • 3. References

1. Table of Contents ¶

This project investigates open-source frameworks for post-deployment model monitoring and performance estimation, with a particular focus on NannyML in detecting and interpreting shifts in machine learning pipelines using Python. The objective was to systematically analyze how different types of drift and distributional changes manifest after model deployment, and to demonstrate how robust monitoring mitigates risks of performance degradation and biased decision-making. The workflow began with the development and selection of a baseline predictive model, which serves as a reference for stability. The dataset was then deliberately perturbed to simulate a range of realistic post-deployment scenarios: Covariate Drift (shifts in feature distributions), Prior Shift (changes in target label proportions), Concept Drift (evolving relationships between features and outcomes), Missingness Spikes (abrupt increases in absent data), and Seasonal Patterns (periodic variations in distributions). NannyML’s Kolmogorov–Smirnov (KS) Statistic and Confidence-Based Performance Estimation (CBPE) Method were subsequently applied to diagnose these shifts, evaluate their potential impact, and provide interpretable insights into model reliability. By contrasting baseline and perturbed conditions, the experiment demonstrated how continuous monitoring augments traditional offline evaluation, offering a safeguard against hidden risks. The findings highlighted how tools like NannyML can integrate seamlessly into MLOps workflows to enable proactive governance, early warning systems, and sustainable deployment practices. All results were consolidated in a Summary presented at the end of the document.

Post-Deployment Monitoring refers to the continuous oversight of machine learning models once they are integrated into production systems. Unlike offline evaluation, which relies on static validation datasets, monitoring addresses the challenges of evolving real-world data streams where underlying distributions may shift. Effective monitoring ensures that models remain accurate, unbiased, and aligned with business objectives. In MLOps, monitoring encompasses data integrity checks, drift detection, performance estimation, and alerting mechanisms. NannyML operationalizes this concept by focusing on performance estimation without ground truth, and by offering statistical methods to detect when data or predictions deviate from expected baselines. The challenges of post-deployment monitoring include delayed or missing ground truth labels, non-stationary data, hidden feedback loops, and difficulties distinguishing natural fluctuations from problematic drifts. Common solutions involve deploying drift detection algorithms, conducting regular audits of data pipelines, simulating counterfactuals, and retraining models on updated data. Monitoring frameworks must balance sensitivity (detecting real problems quickly) with robustness (avoiding false alarms caused by natural noise). Another key challenge is explainability: stakeholders need interpretable signals that justify interventions such as retraining or rolling back models. Tools like NannyML address these challenges through statistical tests for data drift, performance estimation without labels, missingness tracking, and visual diagnostics, making monitoring actionable for data scientists and business teams alike.

Baseline Control represents the stable reference state of a machine learning system against which all post-deployment data and model behavior are compared. It is typically generated using a clean, representative sample of pre-deployment data or early production data collected under known, reliable conditions. This dataset serves as the foundation for defining expected feature distributions, class priors, and performance benchmarks. In post-deployment monitoring, the Baseline Control is essential for distinguishing normal variability from problematic drift or degradation. Metrics such as feature stability, label proportions, and estimated performance consistency characterize its reliability. NannyML operationalizes Baseline Control by allowing users to designate a reference period, fit estimators such as CBPE (Confidence-Based Performance Estimation) on that data, and compute statistical boundaries or confidence intervals. Deviations in subsequent analysis periods, whether in feature distributions, prediction probabilities, or estimated performance, are then detected relative to this baseline. The Baseline Control thus functions as both an empirical anchor and a diagnostic standard, ensuring that drift alerts and performance anomalies are meaningfully contextualized against the model’s original operating state.
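
To make this concrete, the minimal sketch below shows how a reference period might be designated and a CBPE estimator fitted against it in NannyML. The column names (predicted_probability, prediction), the chunk size, and the reference_df/analysis_df frames are illustrative assumptions rather than outputs of this project's pipeline, and exact argument names may vary slightly across NannyML versions.

import nannyml as nml

# Minimal sketch, not this project's pipeline: reference_df is assumed to hold model
# scores, hard predictions, and ground-truth labels collected under reliable conditions.
baseline_estimator = nml.CBPE(
    y_pred_proba="predicted_probability",    # positive-class probability column (assumed name)
    y_pred="prediction",                     # hard prediction column (assumed name)
    y_true="diagnosis",                      # ground-truth column available in the reference data
    problem_type="classification_binary",
    metrics=["roc_auc", "f1"],
    chunk_size=50                            # chunking granularity chosen for illustration
)
baseline_estimator.fit(reference_df)                          # establishes the baseline and its thresholds
estimated_results = baseline_estimator.estimate(analysis_df)  # later chunks are judged against the baseline

Once fitted, the estimator derives expected metric values and alert thresholds from the reference period, so every subsequent analysis chunk is evaluated relative to the baseline rather than in isolation.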

Covariate Drift occurs when the distribution of input features changes over time compared to the data used to train the model. Also known as data drift, it does not necessarily imply that the model’s predictive mapping is invalid, but it often precedes performance degradation. Detecting covariate drift requires comparing feature distributions between baseline (reference) data and incoming production data. NannyML provides multiple statistical tests and visualization tools to flag significant changes. Key signatures of covariate drift include shifts in summary statistics, changes in distributional shape, or increased divergence between reference and production feature distributions. These shifts may lead to poor generalization, as the model has not been exposed to the altered feature ranges. Detection techniques include univariate statistical tests (Kolmogorov–Smirnov, Chi-square), multivariate distance measures (Jensen–Shannon divergence, Population Stability Index), and density estimation methods. Remediation approaches involve domain adaptation, re-weighting training samples, or retraining models on updated data distributions. NannyML implements univariate and multivariate tests, provides drift magnitude quantification, and visualizes feature-level changes, allowing practitioners to pinpoint which features are most responsible for the detected drift.
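
As a hedged illustration of the feature-level tests described above, the sketch below configures NannyML's univariate drift calculator with the Kolmogorov–Smirnov and Jensen–Shannon methods for a few numeric features; the feature subset, chunk size, and dataframe names are assumptions for demonstration only.

import nannyml as nml

# Minimal sketch: compare feature distributions between reference and analysis data.
# Feature subset, chunk size, and dataframe names are illustrative assumptions.
drift_calculator = nml.UnivariateDriftCalculator(
    column_names=["radius_mean", "texture_mean", "area_mean"],
    continuous_methods=["kolmogorov_smirnov", "jensen_shannon"],
    chunk_size=50
)
drift_calculator.fit(reference_df)                               # learn the baseline feature distributions
drift_results = drift_calculator.calculate(analysis_df)
drift_summary = drift_results.filter(period="analysis").to_df()  # per-feature statistics and alert flags

The resulting per-feature drift statistics and alert flags can then be ranked to pinpoint which inputs shifted the most.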

Prior Shift arises when the distribution of the target variable changes, while the conditional relationship between features and labels remains stable. This is also referred to as label shift. Models trained on the original distribution may underperform because their predictions no longer match the new class priors. Detecting prior shifts is crucial, especially in imbalanced classification tasks where small changes in priors can lead to large performance impacts. Prior shift is typically characterized by systematic increases or decreases in class frequencies without corresponding changes in feature distributions. Its impact includes skewed decision thresholds, inflated false positives or false negatives, and degraded calibration of predicted probabilities. Detection approaches include monitoring predicted class proportions, estimating priors using EM-based algorithms, and re-weighting predictions to align with new distributions. Correction strategies may involve resampling, threshold adjustment, or cost-sensitive learning. NannyML assists by tracking predicted probability distributions and comparing them against reference priors, using techniques such as Jensen–Shannon divergence and Population Stability Index to quantify the magnitude of shift.
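
The sketch below illustrates one simple way to track predicted class proportions and quantify their divergence from the reference priors using the Jensen–Shannon distance and a Population Stability Index; the reference_predictions and production_predictions arrays are hypothetical placeholders, and the small epsilon only guards against division by zero.

import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon

def class_proportions(predictions, classes=("B", "M")):
    # Convert a vector of predicted labels into an ordered proportion vector.
    counts = pd.Series(predictions).value_counts(normalize=True)
    return np.array([counts.get(c, 0.0) for c in classes])

# reference_predictions and production_predictions are hypothetical label arrays.
reference_priors = class_proportions(reference_predictions)
production_priors = class_proportions(production_predictions)

# Jensen-Shannon distance (square root of the divergence) between the two prior vectors.
js_distance = jensenshannon(reference_priors, production_priors, base=2)

# Population Stability Index with a small epsilon to avoid division by zero.
eps = 1e-6
psi = np.sum((production_priors - reference_priors) *
             np.log((production_priors + eps) / (reference_priors + eps)))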

Concept Drift occurs when the underlying relationship between input features and target labels evolves over time. Unlike covariate drift, where features change independently, concept drift implies that the model’s mapping function itself becomes outdated. Concept drift is among the most damaging forms of drift because it directly undermines predictive accuracy. Detecting it often requires monitoring model outputs or inferred performance over time. NannyML addresses this by estimating performance even when ground truth labels are unavailable. Concept drift is typically signaled by a gradual or sudden decline in performance metrics, inconsistent error patterns, or misalignment between expected and actual prediction behavior. Its impact is severe: models may lose predictive power entirely if they cannot adapt. Detection methods include window-based performance monitoring, hypothesis testing, adaptive ensembles, and statistical monitoring of residuals. Corrective actions include periodic retraining, incremental learning, and online adaptation strategies. NannyML leverages Confidence-Based Performance Estimation (CBPE) and other statistical techniques to estimate performance degradation without labels, making it possible to detect concept drift in real-time production environments.
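
Of the detection methods listed, window-based performance monitoring is straightforward to sketch once labels eventually arrive: the function below computes AUROC over consecutive chunks and flags windows that fall below a reference score by more than a tolerance. The window size, baseline score, and tolerance are chosen purely for illustration, and labels are assumed to be encoded as 0/1.

import numpy as np
from sklearn.metrics import roc_auc_score

def rolling_auroc_alerts(y_true, y_proba, window=100, baseline_auroc=0.95, tolerance=0.05):
    # Window-based performance monitoring: evaluate AUROC over consecutive chunks
    # and flag any window that degrades beyond the tolerance relative to the baseline.
    y_true = np.asarray(y_true)    # binary ground-truth labels encoded as 0/1
    y_proba = np.asarray(y_proba)  # positive-class probabilities
    alerts = []
    for start in range(0, len(y_true) - window + 1, window):
        chunk = slice(start, start + window)
        if len(np.unique(y_true[chunk])) < 2:
            chunk_auroc = np.nan   # AUROC is undefined when a window holds a single class
        else:
            chunk_auroc = roc_auc_score(y_true[chunk], y_proba[chunk])
        alerts.append({"start_index": start,
                       "auroc": chunk_auroc,
                       "alert": bool(chunk_auroc < (baseline_auroc - tolerance))})
    return alerts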

Missingness Spike refers to sudden increases in missing values within production data. Missing features can destabilize preprocessing pipelines, distort predictions, and signal upstream data collection failures. Monitoring missingness is critical for ensuring both model reliability and data pipeline health. NannyML provides built-in mechanisms to track and visualize changes in missing data patterns, alerting stakeholders before downstream impacts occur. Key indicators of missingness spikes include abrupt rises in null counts, missing categorical levels, or structural breaks in feature completeness. The consequences range from biased predictions to outright system failures if preprocessing pipelines cannot handle unexpected missingness. Detection methods include statistical monitoring of missing value proportions, anomaly detection on completeness metrics, and threshold-based alerts. Solutions typically involve robust imputation, pipeline hardening, and upstream data validation. NannyML offers automated missingness detection, completeness trend visualization, and configurable thresholds, ensuring that missingness issues are surfaced early.
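
NannyML ships its own data-quality calculators for this purpose, but the underlying idea can be sketched with plain pandas, as below: compare per-column missingness rates between a reference frame and a production frame and flag columns whose rate rises by more than an assumed threshold. The dataframe names and the 10-percentage-point threshold are illustrative assumptions.

import pandas as pd

def missingness_spike_report(reference_df, production_df, spike_threshold=0.10):
    # Compare per-column missing-value rates between the reference and production frames
    # and flag columns whose rate increased by more than the chosen threshold.
    ref_rates = reference_df.isna().mean()
    prod_rates = production_df.isna().mean()
    report = pd.DataFrame({"reference_rate": ref_rates,
                           "production_rate": prod_rates})
    report["increase"] = report["production_rate"] - report["reference_rate"]
    report["alert"] = report["increase"] > spike_threshold
    return report.sort_values("increase", ascending=False)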

Seasonal Pattern Shift represents periodic fluctuations in data distributions or outcomes that follow predictable cycles. If models are not trained with sufficient historical data to capture these patterns, their predictions may systematically underperform during certain periods. NannyML’s monitoring can reveal recurring deviations, helping teams distinguish between natural seasonality and genuine drift that requires retraining. Seasonality is often characterized by cyclic patterns in data features, prediction distributions, or performance metrics. Its impact includes systematic biases, recurring error peaks, and difficulty distinguishing drift from natural variability. Detection techniques include autocorrelation analysis, Fourier decomposition, and seasonal-trend decomposition. Mitigation strategies involve training with longer historical datasets, adding time-related features, or developing seasonally adaptive models. NannyML highlights recurring deviations in drift metrics, making it easier for practitioners to separate cyclical behavior from true degradation, ensuring that alerts are contextually relevant.
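
A lightweight way to screen for the cyclic patterns described above is to scan the autocorrelation of a monitored series (a feature mean, a predicted positive rate, or a drift metric) at candidate seasonal lags. The function below does this with pandas alone; the candidate lags and threshold are chosen purely for illustration.

import pandas as pd

def seasonal_lag_screen(series, candidate_lags=(7, 30, 90), threshold=0.5):
    # Compute autocorrelation of the monitored series at candidate seasonal lags
    # and report the lags whose absolute autocorrelation exceeds the threshold.
    series = pd.Series(series).astype(float)
    autocorrelations = {lag: series.autocorr(lag=lag)
                        for lag in candidate_lags if lag < len(series)}
    flagged = {lag: value for lag, value in autocorrelations.items()
               if pd.notna(value) and abs(value) >= threshold}
    return autocorrelations, flagged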

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available, making direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.
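
The intuition behind CBPE can be sketched in a few lines: if the positive-class probabilities are well calibrated, each prediction contributes its expected share of true and false positives and negatives, so expected confusion-matrix entries (and metrics derived from them) can be computed without any labels. The function below is a simplified illustration of that idea, not the NannyML implementation itself.

import numpy as np

def estimate_metrics_without_labels(positive_proba, threshold=0.5):
    # Simplified, label-free metric estimation in the spirit of CBPE:
    # assumes the positive-class probabilities are calibrated.
    proba = np.asarray(positive_proba, dtype=float)
    predicted_positive = proba >= threshold
    expected_tp = proba[predicted_positive].sum()           # expected true positives
    expected_fp = (1.0 - proba[predicted_positive]).sum()   # expected false positives
    expected_fn = proba[~predicted_positive].sum()          # expected false negatives
    expected_tn = (1.0 - proba[~predicted_positive]).sum()  # expected true negatives
    precision = expected_tp / (expected_tp + expected_fp) if (expected_tp + expected_fp) > 0 else np.nan
    recall = expected_tp / (expected_tp + expected_fn) if (expected_tp + expected_fn) > 0 else np.nan
    accuracy = (expected_tp + expected_tn) / proba.size
    return {"precision": precision, "recall": recall, "accuracy": accuracy}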

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.
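
Once ground truth arrives, realized performance can be computed over the same chunks and laid side by side with the CBPE estimates. The sketch below uses NannyML's PerformanceCalculator under the same illustrative column-name and dataframe assumptions as the earlier CBPE example.

import nannyml as nml

# Minimal sketch: compute realized performance once labels are available,
# using the same illustrative column names as the CBPE example above.
performance_calculator = nml.PerformanceCalculator(
    y_pred_proba="predicted_probability",
    y_pred="prediction",
    y_true="diagnosis",
    problem_type="classification_binary",
    metrics=["roc_auc", "f1", "precision", "recall"],
    chunk_size=50
)
performance_calculator.fit(reference_df)
realized_results = performance_calculator.calculate(analysis_df_with_labels)
# Comparing realized_results against the CBPE estimates for the same chunks helps
# validate the label-free estimator and reveal genuine degradation.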

1.1. Data Background ¶

An open Breast Cancer Dataset from Kaggle (with all credits attributed to Wasiq Ali) was used for the analysis, consolidated from the following primary sources:

  1. Reference Repository entitled Breast Cancer Wisconsin (Diagnostic) from the UC Irvine Machine Learning Repository
  2. Research Paper entitled Nuclear Feature Extraction for Breast Tumor Diagnosis from Electronic Imaging

This study hypothesized that cell nuclei features derived from digitized images of fine needle aspirates (FNA) of breast masses influence the breast cancer diagnosis among patients.

The dichotomous categorical variable for the study is:

  • diagnosis - Status of the patient (M, Medical diagnosis of a cancerous breast tumor | B, Medical diagnosis of a non-cancerous breast tumor)

The predictor variables for the study are:

  • radius_mean - Mean of the radius measurements (Mean of distances from center to points on the perimeter)
  • texture_mean - Mean of the texture measurements (Standard deviation of grayscale values)
  • perimeter_mean - Mean of the perimeter measurements
  • area_mean - Mean of the area measurements
  • smoothness_mean - Mean of the smoothness measurements (Local variation in radius lengths)
  • compactness_mean - Mean of the compactness measurements (Perimeter² / area - 1.0)
  • concavity_mean - Mean of the concavity measurements (Severity of concave portions of the contour)
  • concave points_mean - Mean of the concave points measurements (Number of concave portions of the contour)
  • symmetry_mean - Mean of the symmetry measurements
  • fractal_dimension_mean - Mean of the fractal dimension measurements (Coastline approximation - 1)
  • radius_se - Standard error of the radius measurements (Standard error of distances from center to points on the perimeter)
  • texture_se - Standard error of the texture measurements (Standard deviation of grayscale values)
  • perimeter_se - Standard error of the perimeter measurements
  • area_se - Standard error of the area measurements
  • smoothness_se - Standard error of the smoothness measurements (Local variation in radius lengths)
  • compactness_se - Standard error of the compactness measurements (Perimeter² / area - 1.0)
  • concavity_se - Standard error of the concavity measurements (Severity of concave portions of the contour)
  • concave points_se - Standard error of the concave points measurements (Number of concave portions of the contour)
  • symmetry_se - Standard error of the symmetry measurements
  • fractal_dimension_se - Standard error of the fractal dimension measurements (Coastline approximation - 1)
  • radius_worst - Largest value of the radius measurements (Largest value of distances from center to points on the perimeter)
  • texture_worst - Largest value of the texture measurements (Standard deviation of grayscale values)
  • perimeter_worst - Largest value of the perimeter measurements
  • area_worst - Largest value of the area measurements
  • smoothness_worst - Largest value of the smoothness measurements (Local variation in radius lengths)
  • compactness_worst - Largest value of the compactness measurements (Perimeter² / area - 1.0)
  • concavity_worst - Largest value of the concavity measurements (Severity of concave portions of the contour)
  • concave points_worst - Largest value of the concave points measurements (Number of concave portions of the contour)
  • symmetry_worst - Largest value of the symmetry measurements
  • fractal_dimension_worst - Largest value of the fractal dimension measurements (Coastline approximation - 1)

1.2. Data Description ¶

  1. The initial tabular dataset comprised 569 observations and 32 variables (including 1 metadata, 1 target, and 30 predictors).
    • 569 rows (observations)
    • 32 columns (variables)
      • 1/32 metadata (categorical)
        • id
      • 1/32 target (categorical)
        • diagnosis
      • 30/32 predictor (numeric)
        • radius_mean
        • texture_mean
        • perimeter_mean
        • area_mean
        • smoothness_mean
        • compactness_mean
        • concavity_mean
        • concave points_mean
        • symmetry_mean
        • fractal_dimension_mean
        • radius_se
        • texture_se
        • perimeter_se
        • area_se
        • smoothness_se
        • compactness_se
        • concavity_se
        • concave points_se
        • symmetry_se
        • fractal_dimension_se
        • radius_worst
        • texture_worst
        • perimeter_worst
        • area_worst
        • smoothness_worst
        • compactness_worst
        • concavity_worst
        • concave points_worst
        • symmetry_worst
        • fractal_dimension_worst
  2. The id variable was transformed to a row index for the data observations.
In [1]:
##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import joblib
import re
import pickle
%matplotlib inline

import nannyml as nml
from nannyml.performance_estimation import CBPE
from nannyml.performance_calculation import PerformanceCalculator
from nannyml.chunk import DefaultChunker

import hashlib
import json
from urllib.parse import urlparse
import logging

from operator import truediv
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from scipy import stats
from scipy.stats import pointbiserialr, chi2_contingency

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedShuffleSplit, RepeatedStratifiedKFold, GridSearchCV
from sklearn.utils import resample
from sklearn.base import clone

import warnings
warnings.filterwarnings("ignore", message=".*force_all_finite.*")
warnings.filterwarnings("ignore", message="X does not have valid feature names")
In [2]:
##################################
# Defining file paths
##################################
DATASETS_ORIGINAL_PATH = r"datasets\original"
DATASETS_FINAL_PATH = r"datasets\final\complete"
DATASETS_FINAL_TRAIN_PATH = r"datasets\final\train"
DATASETS_FINAL_TRAIN_FEATURES_PATH = r"datasets\final\train\features"
DATASETS_FINAL_TRAIN_TARGET_PATH = r"datasets\final\train\target"
DATASETS_FINAL_VALIDATION_PATH = r"datasets\final\validation"
DATASETS_FINAL_VALIDATION_FEATURES_PATH = r"datasets\final\validation\features"
DATASETS_FINAL_VALIDATION_TARGET_PATH = r"datasets\final\validation\target"
DATASETS_FINAL_TEST_PATH = r"datasets\final\test"
DATASETS_FINAL_TEST_FEATURES_PATH = r"datasets\final\test\features"
DATASETS_FINAL_TEST_TARGET_PATH = r"datasets\final\test\target"
DATASETS_PREPROCESSED_PATH = r"datasets\preprocessed"
DATASETS_PREPROCESSED_TRAIN_PATH = r"datasets\preprocessed\train"
DATASETS_PREPROCESSED_TRAIN_FEATURES_PATH = r"datasets\preprocessed\train\features"
DATASETS_PREPROCESSED_TRAIN_TARGET_PATH = r"datasets\preprocessed\train\target"
DATASETS_PREPROCESSED_VALIDATION_PATH = r"datasets\preprocessed\validation"
DATASETS_PREPROCESSED_VALIDATION_FEATURES_PATH = r"datasets\preprocessed\validation\features"
DATASETS_PREPROCESSED_VALIDATION_TARGET_PATH = r"datasets\preprocessed\validation\target"
DATASETS_PREPROCESSED_TEST_PATH = r"datasets\preprocessed\test"
DATASETS_PREPROCESSED_TEST_FEATURES_PATH = r"datasets\preprocessed\test\features"
DATASETS_PREPROCESSED_TEST_TARGET_PATH = r"datasets\preprocessed\test\target"
MODELS_PATH = r"models"
In [3]:
##################################
# Loading the dataset
# from the DATASETS_ORIGINAL_PATH
##################################
breast_cancer = pd.read_csv(os.path.join("..", DATASETS_ORIGINAL_PATH, "Breast_Cancer_Dataset.csv"))
In [4]:
##################################
# Performing a general exploration of the dataset
##################################
print('Dataset Dimensions: ')
display(breast_cancer.shape)
Dataset Dimensions: 
(569, 32)
In [5]:
##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(breast_cancer.dtypes)
Column Names and Data Types:
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object
In [6]:
##################################
# Setting the ID column as row names
##################################
breast_cancer = breast_cancer.set_index("id")
In [7]:
##################################
# Taking a snapshot of the dataset
##################################
breast_cancer.head()
Out[7]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
id
842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 31 columns

In [8]:
##################################
# Performing a general exploration of the numeric variables
##################################
print('Numeric Variable Summary:')
display(breast_cancer.describe(include='number').transpose())
Numeric Variable Summary:
count mean std min 25% 50% 75% max
radius_mean 569.0 14.127292 3.524049 6.981000 11.700000 13.370000 15.780000 28.11000
texture_mean 569.0 19.289649 4.301036 9.710000 16.170000 18.840000 21.800000 39.28000
perimeter_mean 569.0 91.969033 24.298981 43.790000 75.170000 86.240000 104.100000 188.50000
area_mean 569.0 654.889104 351.914129 143.500000 420.300000 551.100000 782.700000 2501.00000
smoothness_mean 569.0 0.096360 0.014064 0.052630 0.086370 0.095870 0.105300 0.16340
compactness_mean 569.0 0.104341 0.052813 0.019380 0.064920 0.092630 0.130400 0.34540
concavity_mean 569.0 0.088799 0.079720 0.000000 0.029560 0.061540 0.130700 0.42680
concave points_mean 569.0 0.048919 0.038803 0.000000 0.020310 0.033500 0.074000 0.20120
symmetry_mean 569.0 0.181162 0.027414 0.106000 0.161900 0.179200 0.195700 0.30400
fractal_dimension_mean 569.0 0.062798 0.007060 0.049960 0.057700 0.061540 0.066120 0.09744
radius_se 569.0 0.405172 0.277313 0.111500 0.232400 0.324200 0.478900 2.87300
texture_se 569.0 1.216853 0.551648 0.360200 0.833900 1.108000 1.474000 4.88500
perimeter_se 569.0 2.866059 2.021855 0.757000 1.606000 2.287000 3.357000 21.98000
area_se 569.0 40.337079 45.491006 6.802000 17.850000 24.530000 45.190000 542.20000
smoothness_se 569.0 0.007041 0.003003 0.001713 0.005169 0.006380 0.008146 0.03113
compactness_se 569.0 0.025478 0.017908 0.002252 0.013080 0.020450 0.032450 0.13540
concavity_se 569.0 0.031894 0.030186 0.000000 0.015090 0.025890 0.042050 0.39600
concave points_se 569.0 0.011796 0.006170 0.000000 0.007638 0.010930 0.014710 0.05279
symmetry_se 569.0 0.020542 0.008266 0.007882 0.015160 0.018730 0.023480 0.07895
fractal_dimension_se 569.0 0.003795 0.002646 0.000895 0.002248 0.003187 0.004558 0.02984
radius_worst 569.0 16.269190 4.833242 7.930000 13.010000 14.970000 18.790000 36.04000
texture_worst 569.0 25.677223 6.146258 12.020000 21.080000 25.410000 29.720000 49.54000
perimeter_worst 569.0 107.261213 33.602542 50.410000 84.110000 97.660000 125.400000 251.20000
area_worst 569.0 880.583128 569.356993 185.200000 515.300000 686.500000 1084.000000 4254.00000
smoothness_worst 569.0 0.132369 0.022832 0.071170 0.116600 0.131300 0.146000 0.22260
compactness_worst 569.0 0.254265 0.157336 0.027290 0.147200 0.211900 0.339100 1.05800
concavity_worst 569.0 0.272188 0.208624 0.000000 0.114500 0.226700 0.382900 1.25200
concave points_worst 569.0 0.114606 0.065732 0.000000 0.064930 0.099930 0.161400 0.29100
symmetry_worst 569.0 0.290076 0.061867 0.156500 0.250400 0.282200 0.317900 0.66380
fractal_dimension_worst 569.0 0.083946 0.018061 0.055040 0.071460 0.080040 0.092080 0.20750

1.3. Data Quality Assessment ¶

Data quality findings based on assessment are as follows:

  1. No duplicated rows were noted.
  2. No missing data noted for any variable with Null.Count>0 and Fill.Rate<1.0.
  3. No low variance observed for any variable with First.Second.Mode.Ratio>10.
  4. No low variance observed for any variable with Unique.Count.Ratio>10.
  5. High skewness observed for 5 variables with Skewness>3 or Skewness<(-3).
    • area_se: Skewness = 5.447
    • concavity_se: Skewness = 5.110
    • fractal_dimension_se: Skewness = 3.923
    • perimeter_se: Skewness = 3.443
    • radius_se: Skewness = 3.088
In [9]:
##################################
# Counting the number of duplicated rows
##################################
breast_cancer.duplicated().sum()
Out[9]:
np.int64(0)
In [10]:
##################################
# Gathering the data types for each column
##################################
data_type_list = list(breast_cancer.dtypes)
In [11]:
##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(breast_cancer.columns)
In [12]:
##################################
# Gathering the number of observations for each column
##################################
row_count_list = list([len(breast_cancer)] * len(breast_cancer.columns))
In [13]:
##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(breast_cancer.isna().sum(axis=0))
In [14]:
##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(breast_cancer.count())
In [15]:
##################################
# Gathering the fill rate (non-missing data proportion) for each column
##################################
fill_rate_list = map(truediv, non_null_count_list, row_count_list)
In [16]:
##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                              data_type_list,
                                              row_count_list,
                                              non_null_count_list,
                                              null_count_list,
                                              fill_rate_list), 
                                        columns=['Column.Name',
                                                 'Column.Type',
                                                 'Row.Count',
                                                 'Non.Null.Count',
                                                 'Null.Count',                                                 
                                                 'Fill.Rate'])
display(all_column_quality_summary)
Column.Name Column.Type Row.Count Non.Null.Count Null.Count Fill.Rate
0 diagnosis object 569 569 0 1.0
1 radius_mean float64 569 569 0 1.0
2 texture_mean float64 569 569 0 1.0
3 perimeter_mean float64 569 569 0 1.0
4 area_mean float64 569 569 0 1.0
5 smoothness_mean float64 569 569 0 1.0
6 compactness_mean float64 569 569 0 1.0
7 concavity_mean float64 569 569 0 1.0
8 concave points_mean float64 569 569 0 1.0
9 symmetry_mean float64 569 569 0 1.0
10 fractal_dimension_mean float64 569 569 0 1.0
11 radius_se float64 569 569 0 1.0
12 texture_se float64 569 569 0 1.0
13 perimeter_se float64 569 569 0 1.0
14 area_se float64 569 569 0 1.0
15 smoothness_se float64 569 569 0 1.0
16 compactness_se float64 569 569 0 1.0
17 concavity_se float64 569 569 0 1.0
18 concave points_se float64 569 569 0 1.0
19 symmetry_se float64 569 569 0 1.0
20 fractal_dimension_se float64 569 569 0 1.0
21 radius_worst float64 569 569 0 1.0
22 texture_worst float64 569 569 0 1.0
23 perimeter_worst float64 569 569 0 1.0
24 area_worst float64 569 569 0 1.0
25 smoothness_worst float64 569 569 0 1.0
26 compactness_worst float64 569 569 0 1.0
27 concavity_worst float64 569 569 0 1.0
28 concave points_worst float64 569 569 0 1.0
29 symmetry_worst float64 569 569 0 1.0
30 fractal_dimension_worst float64 569 569 0 1.0
In [17]:
##################################
# Counting the number of columns
# with Fill.Rate < 1.00
##################################
len(all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<1)])
Out[17]:
0
In [18]:
##################################
# Identifying the rows
# with Fill.Rate < 0.90
##################################
column_low_fill_rate = all_column_quality_summary[(all_column_quality_summary['Fill.Rate']<0.90)]
In [19]:
##################################
# Gathering the indices for each observation
##################################
row_index_list = breast_cancer.index
In [20]:
##################################
# Gathering the number of columns for each observation
##################################
column_count_list = list([len(breast_cancer.columns)] * len(breast_cancer))
In [21]:
##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(breast_cancer.isna().sum(axis=1))
In [22]:
##################################
# Gathering the missing data percentage for each row
##################################
missing_rate_list = map(truediv, null_row_list, column_count_list)
In [23]:
##################################
# Identifying the rows
# with missing data
##################################
all_row_quality_summary = pd.DataFrame(zip(row_index_list,
                                           column_count_list,
                                           null_row_list,
                                           missing_rate_list), 
                                        columns=['Row.Name',
                                                 'Column.Count',
                                                 'Null.Count',                                                 
                                                 'Missing.Rate'])
display(all_row_quality_summary)
Row.Name Column.Count Null.Count Missing.Rate
0 842302 31 0 0.0
1 842517 31 0 0.0
2 84300903 31 0 0.0
3 84348301 31 0 0.0
4 84358402 31 0 0.0
... ... ... ... ...
564 926424 31 0 0.0
565 926682 31 0 0.0
566 926954 31 0 0.0
567 927241 31 0 0.0
568 92751 31 0 0.0

569 rows × 4 columns

In [24]:
##################################
# Counting the number of rows
# with Missing.Rate > 0.00
##################################
len(all_row_quality_summary[(all_row_quality_summary['Missing.Rate']>0.00)])
Out[24]:
0
In [25]:
##################################
# Formulating the dataset
# with numeric columns only
##################################
breast_cancer_numeric = breast_cancer.select_dtypes(include='number')
In [26]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = breast_cancer_numeric.columns
In [27]:
##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = breast_cancer_numeric.min()
In [28]:
##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = breast_cancer_numeric.mean()
In [29]:
##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = breast_cancer_numeric.median()
In [30]:
##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = breast_cancer_numeric.max()
In [31]:
##################################
# Gathering the first mode values for each numeric column
##################################
numeric_first_mode_list = [breast_cancer[x].value_counts(dropna=True).index.tolist()[0] for x in breast_cancer_numeric]
In [32]:
##################################
# Gathering the second mode values for each numeric column
##################################
numeric_second_mode_list = [breast_cancer[x].value_counts(dropna=True).index.tolist()[1] for x in breast_cancer_numeric]
In [33]:
##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [breast_cancer_numeric[x].isin([breast_cancer[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in breast_cancer_numeric]
In [34]:
##################################
# Gathering the count of second mode values for each numeric column
##################################
numeric_second_mode_count_list = [breast_cancer_numeric[x].isin([breast_cancer[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in breast_cancer_numeric]
In [35]:
##################################
# Gathering the first mode to second mode ratio for each numeric column
##################################
numeric_first_second_mode_ratio_list = map(truediv, numeric_first_mode_count_list, numeric_second_mode_count_list)
In [36]:
##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = breast_cancer_numeric.nunique(dropna=True)
In [37]:
##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = list([len(breast_cancer_numeric)] * len(breast_cancer_numeric.columns))
In [38]:
##################################
# Gathering the unique to count ratio for each numeric column
##################################
numeric_unique_count_ratio_list = map(truediv, numeric_unique_count_list, numeric_row_count_list)
In [39]:
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = breast_cancer_numeric.skew()
In [40]:
##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = breast_cancer_numeric.kurtosis()
In [41]:
##################################
# Generating a column quality summary for the numeric column
##################################
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                numeric_minimum_list,
                                                numeric_mean_list,
                                                numeric_median_list,
                                                numeric_maximum_list,
                                                numeric_first_mode_list,
                                                numeric_second_mode_list,
                                                numeric_first_mode_count_list,
                                                numeric_second_mode_count_list,
                                                numeric_first_second_mode_ratio_list,
                                                numeric_unique_count_list,
                                                numeric_row_count_list,
                                                numeric_unique_count_ratio_list,
                                                numeric_skewness_list,
                                                numeric_kurtosis_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Minimum',
                                                 'Mean',
                                                 'Median',
                                                 'Maximum',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio',
                                                 'Skewness',
                                                 'Kurtosis'])
display(numeric_column_quality_summary)
Numeric.Column.Name Minimum Mean Median Maximum First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio Skewness Kurtosis
0 radius_mean 6.981000 14.127292 13.370000 28.11000 12.340000 11.060000 4 3 1.333333 456 569 0.801406 0.942380 0.845522
1 texture_mean 9.710000 19.289649 18.840000 39.28000 16.840000 19.830000 3 3 1.000000 479 569 0.841828 0.650450 0.758319
2 perimeter_mean 43.790000 91.969033 86.240000 188.50000 82.610000 134.700000 3 3 1.000000 522 569 0.917399 0.990650 0.972214
3 area_mean 143.500000 654.889104 551.100000 2501.00000 512.200000 394.100000 3 2 1.500000 539 569 0.947276 1.645732 3.652303
4 smoothness_mean 0.052630 0.096360 0.095870 0.16340 0.100700 0.105400 5 4 1.250000 474 569 0.833040 0.456324 0.855975
5 compactness_mean 0.019380 0.104341 0.092630 0.34540 0.114700 0.120600 3 3 1.000000 537 569 0.943761 1.190123 1.650130
6 concavity_mean 0.000000 0.088799 0.061540 0.42680 0.000000 0.120400 13 3 4.333333 537 569 0.943761 1.401180 1.998638
7 concave points_mean 0.000000 0.048919 0.033500 0.20120 0.000000 0.028640 13 3 4.333333 542 569 0.952548 1.171180 1.066556
8 symmetry_mean 0.106000 0.181162 0.179200 0.30400 0.176900 0.189300 4 4 1.000000 432 569 0.759227 0.725609 1.287933
9 fractal_dimension_mean 0.049960 0.062798 0.061540 0.09744 0.067820 0.061130 3 3 1.000000 499 569 0.876977 1.304489 3.005892
10 radius_se 0.111500 0.405172 0.324200 2.87300 0.286000 0.220400 3 3 1.000000 540 569 0.949033 3.088612 17.686726
11 texture_se 0.360200 1.216853 1.108000 4.88500 0.856100 1.350000 3 3 1.000000 519 569 0.912127 1.646444 5.349169
12 perimeter_se 0.757000 2.866059 2.287000 21.98000 1.778000 1.143000 4 2 2.000000 533 569 0.936731 3.443615 21.401905
13 area_se 6.802000 40.337079 24.530000 542.20000 16.970000 16.640000 3 3 1.000000 528 569 0.927944 5.447186 49.209077
14 smoothness_se 0.001713 0.007041 0.006380 0.03113 0.005910 0.006064 2 2 1.000000 547 569 0.961336 2.314450 10.469840
15 compactness_se 0.002252 0.025478 0.020450 0.13540 0.018120 0.011040 3 3 1.000000 541 569 0.950791 1.902221 5.106252
16 concavity_se 0.000000 0.031894 0.025890 0.39600 0.000000 0.021850 13 2 6.500000 533 569 0.936731 5.110463 48.861395
17 concave points_se 0.000000 0.011796 0.010930 0.05279 0.000000 0.011670 13 3 4.333333 507 569 0.891037 1.444678 5.126302
18 symmetry_se 0.007882 0.020542 0.018730 0.07895 0.013440 0.020450 4 3 1.333333 498 569 0.875220 2.195133 7.896130
19 fractal_dimension_se 0.000895 0.003795 0.003187 0.02984 0.002256 0.002205 2 2 1.000000 545 569 0.957821 3.923969 26.280847
20 radius_worst 7.930000 16.269190 14.970000 36.04000 12.360000 13.500000 5 4 1.250000 457 569 0.803163 1.103115 0.944090
21 texture_worst 12.020000 25.677223 25.410000 49.54000 17.700000 27.260000 3 3 1.000000 511 569 0.898067 0.498321 0.224302
22 perimeter_worst 50.410000 107.261213 97.660000 251.20000 117.700000 105.900000 3 3 1.000000 514 569 0.903339 1.128164 1.070150
23 area_worst 185.200000 880.583128 686.500000 4254.00000 698.800000 808.900000 2 2 1.000000 544 569 0.956063 1.859373 4.396395
24 smoothness_worst 0.071170 0.132369 0.131300 0.22260 0.140100 0.131200 4 4 1.000000 411 569 0.722320 0.415426 0.517825
25 compactness_worst 0.027290 0.254265 0.211900 1.05800 0.148600 0.341600 3 3 1.000000 529 569 0.929701 1.473555 3.039288
26 concavity_worst 0.000000 0.272188 0.226700 1.25200 0.000000 0.450400 13 3 4.333333 539 569 0.947276 1.150237 1.615253
27 concave points_worst 0.000000 0.114606 0.099930 0.29100 0.000000 0.110500 13 3 4.333333 492 569 0.864675 0.492616 -0.535535
28 symmetry_worst 0.156500 0.290076 0.282200 0.66380 0.236900 0.310900 3 3 1.000000 500 569 0.878735 1.433928 4.444560
29 fractal_dimension_worst 0.055040 0.083946 0.080040 0.20750 0.074270 0.087010 3 2 1.500000 535 569 0.940246 1.662579 5.244611
In [42]:
##################################
# Counting the number of numeric columns
# with First.Second.Mode.Ratio > 10.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['First.Second.Mode.Ratio']>10)])
Out[42]:
0
In [43]:
##################################
# Counting the number of numeric columns
# with Unique.Count.Ratio > 10.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Unique.Count.Ratio']>10)])
Out[43]:
0
In [44]:
#################################
# Counting the number of numeric columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
len(numeric_column_quality_summary[(numeric_column_quality_summary['Skewness']>3) | (numeric_column_quality_summary['Skewness']<(-3))])
Out[44]:
5
In [45]:
##################################
# Identifying the numerical columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
display(numeric_column_quality_summary[(numeric_column_quality_summary['Skewness']>3) | (numeric_column_quality_summary['Skewness']<(-3))].sort_values(by=['Skewness'], ascending=False))
Numeric.Column.Name Minimum Mean Median Maximum First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio Skewness Kurtosis
13 area_se 6.802000 40.337079 24.530000 542.20000 16.970000 16.640000 3 3 1.0 528 569 0.927944 5.447186 49.209077
16 concavity_se 0.000000 0.031894 0.025890 0.39600 0.000000 0.021850 13 2 6.5 533 569 0.936731 5.110463 48.861395
19 fractal_dimension_se 0.000895 0.003795 0.003187 0.02984 0.002256 0.002205 2 2 1.0 545 569 0.957821 3.923969 26.280847
12 perimeter_se 0.757000 2.866059 2.287000 21.98000 1.778000 1.143000 4 2 2.0 533 569 0.936731 3.443615 21.401905
10 radius_se 0.111500 0.405172 0.324200 2.87300 0.286000 0.220400 3 3 1.0 540 569 0.949033 3.088612 17.686726
In [46]:
##################################
# Formulating the dataset
# with categorical columns only
##################################
breast_cancer_categorical = breast_cancer.select_dtypes(include=['category','object'])
In [47]:
##################################
# Gathering the variable names for the categorical column
##################################
categorical_variable_name_list = breast_cancer_categorical.columns
In [48]:
##################################
# Gathering the first mode values for each categorical column
##################################
categorical_first_mode_list = [breast_cancer[x].value_counts().index.tolist()[0] for x in breast_cancer_categorical]
In [49]:
##################################
# Gathering the second mode values for each categorical column
##################################
categorical_second_mode_list = [breast_cancer[x].value_counts().index.tolist()[1] for x in breast_cancer_categorical]
In [50]:
##################################
# Gathering the count of first mode values for each categorical column
##################################
categorical_first_mode_count_list = [breast_cancer_categorical[x].isin([breast_cancer[x].value_counts(dropna=True).index.tolist()[0]]).sum() for x in breast_cancer_categorical]
In [51]:
##################################
# Gathering the count of second mode values for each categorical column
##################################
categorical_second_mode_count_list = [breast_cancer_categorical[x].isin([breast_cancer[x].value_counts(dropna=True).index.tolist()[1]]).sum() for x in breast_cancer_categorical]
In [52]:
##################################
# Gathering the first mode to second mode ratio for each categorical column
##################################
categorical_first_second_mode_ratio_list = map(truediv, categorical_first_mode_count_list, categorical_second_mode_count_list)
In [53]:
##################################
# Gathering the count of unique values for each categorical column
##################################
categorical_unique_count_list = breast_cancer_categorical.nunique(dropna=True)
In [54]:
##################################
# Gathering the number of observations for each categorical column
##################################
categorical_row_count_list = list([len(breast_cancer_categorical)] * len(breast_cancer_categorical.columns))
In [55]:
##################################
# Gathering the unique to count ratio for each categorical column
##################################
categorical_unique_count_ratio_list = map(truediv, categorical_unique_count_list, categorical_row_count_list)
In [56]:
##################################
# Generating a column quality summary for the categorical columns
##################################
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
                                                    categorical_first_mode_list,
                                                    categorical_second_mode_list,
                                                    categorical_first_mode_count_list,
                                                    categorical_second_mode_count_list,
                                                    categorical_first_second_mode_ratio_list,
                                                    categorical_unique_count_list,
                                                    categorical_row_count_list,
                                                    categorical_unique_count_ratio_list), 
                                        columns=['Categorical.Column.Name',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio'])
display(categorical_column_quality_summary)
Categorical.Column.Name First.Mode Second.Mode First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio Unique.Count Row.Count Unique.Count.Ratio
0 diagnosis B M 357 212 1.683962 2 569 0.003515
In [57]:
##################################
# Counting the number of categorical columns
# with First.Second.Mode.Ratio > 5.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['First.Second.Mode.Ratio']>5)])
Out[57]:
0
In [58]:
##################################
# Counting the number of categorical columns
# with Unique.Count.Ratio > 10.00
##################################
len(categorical_column_quality_summary[(categorical_column_quality_summary['Unique.Count.Ratio']>10)])
Out[58]:
0

1.4. Data Preprocessing ¶

1.4.1 Data Splitting¶

  1. The baseline dataset is comprised of:
    • 569 rows (observations)
      • 357 diagnosis=B: 62.74%
      • 212 diagnosis=M: 37.26%
    • 31 columns (variables)
      • 1/31 target (categorical)
        • diagnosis
      • 30/31 predictor (numeric)
        • radius_mean
        • texture_mean
        • perimeter_mean
        • area_mean
        • smoothness_mean
        • compactness_mean
        • concavity_mean
        • concave points_mean
        • symmetry_mean
        • fractal_dimension_mean
        • radius_se
        • texture_se
        • perimeter_se
        • area_se
        • smoothness_se
        • compactness_se
        • concavity_se
        • concave points_se
        • symmetry_se
        • fractal_dimension_se
        • radius_worst
        • texture_worst
        • perimeter_worst
        • area_worst
        • smoothness_worst
        • compactness_worst
        • concavity_worst
        • concave points_worst
        • symmetry_worst
        • fractal_dimension_worst
  2. The baseline dataset was divided into three subsets using a fixed random seed:
    • test data: 25% of the original data with class stratification applied
    • train data (initial): 75% of the original data with class stratification applied
      • train data (final): 75% of the train (initial) data with class stratification applied
      • validation data: 25% of the train (initial) data with class stratification applied
  3. Models were developed from the train data (final). Using the same dataset, a subset of models with optimal hyperparameters was selected based on cross-validation.
  4. Among candidate models with optimal hyperparameters, the final model was selected based on performance on the validation data.
  5. Performance of the selected final model (and other candidate models for post-model selection comparison) was evaluated using the test data.
  6. The train data (final) subset is comprised of:
    • 319 rows (observations)
      • 200 diagnosis=B: 62.70%
      • 119 diagnosis=M: 37.30%
    • 31 columns (variables)
  7. The validation data subset is comprised of:
    • 107 rows (observations)
      • 67 diagnosis=B: 62.62%
      • 40 diagnosis=M: 37.38%
    • 31 columns (variables)
  8. The test data subset is comprised of:
    • 143 rows (observations)
      • 90 diagnosis=B: 62.94%
      • 53 diagnosis=M: 37.06%
    • 31 columns (variables)
In [59]:
##################################
# Creating a dataset copy
# of the original data
##################################
breast_cancer_baseline = breast_cancer.copy()
In [60]:
##################################
# Performing a general exploration
# of the baseline dataset
##################################
print('Final Dataset Dimensions: ')
display(breast_cancer_baseline.shape)
Final Dataset Dimensions: 
(569, 31)
In [61]:
##################################
# Obtaining the distribution
# of the target variable
##################################
print('Target Variable Breakdown: ')
breast_cancer_breakdown = breast_cancer_baseline.groupby('diagnosis', observed=True).size().reset_index(name='Count')
breast_cancer_breakdown['Percentage'] = (breast_cancer_breakdown['Count'] / len(breast_cancer_baseline)) * 100
display(breast_cancer_breakdown)
Target Variable Breakdown: 
diagnosis Count Percentage
0 B 357 62.741652
1 M 212 37.258348
In [62]:
##################################
# Formulating the train and test data
# from the final dataset
# by applying stratification and
# using a 75-25 ratio
##################################
breast_cancer_train_initial, breast_cancer_test = train_test_split(breast_cancer_baseline, 
                                                               test_size=0.25, 
                                                               stratify=breast_cancer_baseline['diagnosis'], 
                                                               random_state=987654321)
In [63]:
##################################
# Performing a general exploration
# of the initial training dataset
##################################
X_train_initial = breast_cancer_train_initial.drop('diagnosis', axis = 1)
y_train_initial = breast_cancer_train_initial['diagnosis']
print('Initial Train Dataset Dimensions: ')
display(X_train_initial.shape)
display(y_train_initial.shape)
print('Initial Train Target Variable Breakdown: ')
display(y_train_initial.value_counts())
print('Initial Train Target Variable Proportion: ')
display(y_train_initial.value_counts(normalize = True))
Initial Train Dataset Dimensions: 
(426, 30)
(426,)
Initial Train Target Variable Breakdown: 
diagnosis
B    267
M    159
Name: count, dtype: int64
Initial Train Target Variable Proportion: 
diagnosis
B    0.626761
M    0.373239
Name: proportion, dtype: float64
In [64]:
##################################
# Performing a general exploration
# of the test dataset
##################################
X_test = breast_cancer_test.drop('diagnosis', axis = 1)
y_test = breast_cancer_test['diagnosis']
print('Test Dataset Dimensions: ')
display(X_test.shape)
display(y_test.shape)
print('Test Target Variable Breakdown: ')
display(y_test.value_counts())
print('Test Target Variable Proportion: ')
display(y_test.value_counts(normalize = True))
Test Dataset Dimensions: 
(143, 30)
(143,)
Test Target Variable Breakdown: 
diagnosis
B    90
M    53
Name: count, dtype: int64
Test Target Variable Proportion: 
diagnosis
B    0.629371
M    0.370629
Name: proportion, dtype: float64
In [65]:
##################################
# Formulating the train and validation data
# from the train dataset
# by applying stratification and
# using a 75-25 ratio
##################################
breast_cancer_train, breast_cancer_validation = train_test_split(breast_cancer_train_initial, 
                                                             test_size=0.25, 
                                                             stratify=breast_cancer_train_initial['diagnosis'], 
                                                             random_state=987654321)
In [66]:
##################################
# Performing a general exploration
# of the final training dataset
##################################
X_train = breast_cancer_train.drop('diagnosis', axis = 1)
y_train = breast_cancer_train['diagnosis']
print('Final Train Dataset Dimensions: ')
display(X_train.shape)
display(y_train.shape)
print('Final Train Target Variable Breakdown: ')
display(y_train.value_counts())
print('Final Train Target Variable Proportion: ')
display(y_train.value_counts(normalize = True))
Final Train Dataset Dimensions: 
(319, 30)
(319,)
Final Train Target Variable Breakdown: 
diagnosis
B    200
M    119
Name: count, dtype: int64
Final Train Target Variable Proportion: 
diagnosis
B    0.626959
M    0.373041
Name: proportion, dtype: float64
In [67]:
##################################
# Performing a general exploration
# of the validation dataset
##################################
X_validation = breast_cancer_validation.drop('diagnosis', axis = 1)
y_validation = breast_cancer_validation['diagnosis']
print('Validation Dataset Dimensions: ')
display(X_validation.shape)
display(y_validation.shape)
print('Validation Target Variable Breakdown: ')
display(y_validation.value_counts())
print('Validation Target Variable Proportion: ')
display(y_validation.value_counts(normalize = True))
Validation Dataset Dimensions: 
(107, 30)
(107,)
Validation Target Variable Breakdown: 
diagnosis
B    67
M    40
Name: count, dtype: int64
Validation Target Variable Proportion: 
diagnosis
B    0.626168
M    0.373832
Name: proportion, dtype: float64
In [68]:
##################################
# Saving the training data
# to the DATASETS_FINAL_TRAIN_PATH
# and DATASETS_FINAL_TRAIN_FEATURES_PATH
# and DATASETS_FINAL_TRAIN_TARGET_PATH
##################################
breast_cancer_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_PATH, "breast_cancer_train.csv"), index=False)
X_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_FEATURES_PATH, "X_train.csv"), index=False)
y_train.to_csv(os.path.join("..", DATASETS_FINAL_TRAIN_TARGET_PATH, "y_train.csv"), index=False)
In [69]:
##################################
# Saving the validation data
# to the DATASETS_FINAL_VALIDATION_PATH
# and DATASETS_FINAL_VALIDATION_FEATURE_PATH
# and DATASETS_FINAL_VALIDATION_TARGET_PATH
##################################
breast_cancer_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_PATH, "breast_cancer_validation.csv"), index=False)
X_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_FEATURES_PATH, "X_validation.csv"), index=False)
y_validation.to_csv(os.path.join("..", DATASETS_FINAL_VALIDATION_TARGET_PATH, "y_validation.csv"), index=False)
In [70]:
##################################
# Saving the test data
# to the DATASETS_FINAL_TEST_PATH
# and DATASETS_FINAL_TEST_FEATURES_PATH
# and DATASETS_FINAL_TEST_TARGET_PATH
##################################
breast_cancer_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_PATH, "breast_cancer_test.csv"), index=False)
X_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_FEATURES_PATH, "X_test.csv"), index=False)
y_test.to_csv(os.path.join("..", DATASETS_FINAL_TEST_TARGET_PATH, "y_test.csv"), index=False)

1.4.2 Outlier and Distributional Shape Analysis¶

Outlier and distributional shape analysis findings based on assessment of the training data are as follows:

  1. High skewness observed for 5 variables with Skewness>3 or Skewness<(-3).
    • area_se: Skewness = 6.562
    • concavity_se: Skewness = 5.648
    • fractal_dimension_se: Skewness = 4.280
    • perimeter_se: Skewness = 4.136
    • radius_se: Skewness = 3.775
  2. Relatively high number of outliers observed for 7 numeric variables with Outlier.Ratio>0.05.
    • area_se: Outlier.Ratio = 0.110
    • radius_se: Outlier.Ratio = 0.075
    • perimeter_se: Outlier.Ratio = 0.075
    • smoothness_se: Outlier.Ratio = 0.059
    • compactness_se: Outlier.Ratio = 0.059
    • fractal_dimension_se: Outlier.Ratio = 0.056
    • symmetry_se: Outlier.Ratio = 0.050
In [71]:
##################################
# Formulating the training dataset
# with numeric columns only
##################################
breast_cancer_train_numeric = breast_cancer_train.select_dtypes(include='number')
In [72]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = list(breast_cancer_train_numeric.columns)
In [73]:
##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = breast_cancer_train_numeric.skew()
In [74]:
##################################
# Computing the interquartile range
# for all columns
##################################
breast_cancer_train_numeric_q1 = breast_cancer_train_numeric.quantile(0.25)
breast_cancer_train_numeric_q3 = breast_cancer_train_numeric.quantile(0.75)
breast_cancer_train_numeric_iqr = breast_cancer_train_numeric_q3 - breast_cancer_train_numeric_q1
In [75]:
##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
##################################
numeric_outlier_count_list = ((breast_cancer_train_numeric < (breast_cancer_train_numeric_q1 - 1.5 * breast_cancer_train_numeric_iqr)) | (breast_cancer_train_numeric > (breast_cancer_train_numeric_q3 + 1.5 * breast_cancer_train_numeric_iqr))).sum()
In [76]:
##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = list([len(breast_cancer_train_numeric)] * len(breast_cancer_train_numeric.columns))
In [77]:
##################################
# Computing the outlier ratio for each numeric column
##################################
numeric_outlier_ratio_list = map(truediv, numeric_outlier_count_list, numeric_row_count_list)
In [78]:
##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_skewness_list,
                                                  numeric_outlier_count_list,
                                                  numeric_row_count_list,
                                                  numeric_outlier_ratio_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Skewness',
                                                 'Outlier.Count',
                                                 'Row.Count',
                                                 'Outlier.Ratio'])
display(numeric_column_outlier_summary)
Numeric.Column.Name Skewness Outlier.Count Row.Count Outlier.Ratio
0 radius_mean 0.966211 6 319 0.018809
1 texture_mean 0.746964 4 319 0.012539
2 perimeter_mean 1.034320 6 319 0.018809
3 area_mean 1.819687 9 319 0.028213
4 smoothness_mean 0.166009 1 319 0.003135
5 compactness_mean 1.115958 6 319 0.018809
6 concavity_mean 1.412274 10 319 0.031348
7 concave points_mean 1.155582 11 319 0.034483
8 symmetry_mean 0.532891 7 319 0.021944
9 fractal_dimension_mean 1.054941 8 319 0.025078
10 radius_se 3.775498 24 319 0.075235
11 texture_se 1.464707 10 319 0.031348
12 perimeter_se 4.136225 24 319 0.075235
13 area_se 6.562034 35 319 0.109718
14 smoothness_se 1.313172 19 319 0.059561
15 compactness_se 1.701432 19 319 0.059561
16 concavity_se 5.648674 14 319 0.043887
17 concave points_se 1.592173 14 319 0.043887
18 symmetry_se 2.442436 16 319 0.050157
19 fractal_dimension_se 4.280973 18 319 0.056426
20 radius_worst 1.016127 3 319 0.009404
21 texture_worst 0.476084 2 319 0.006270
22 perimeter_worst 1.075965 5 319 0.015674
23 area_worst 1.892646 13 319 0.040752
24 smoothness_worst 0.237077 0 319 0.000000
25 compactness_worst 1.098476 6 319 0.018809
26 concavity_worst 1.067913 5 319 0.015674
27 concave points_worst 0.436446 0 319 0.000000
28 symmetry_worst 1.154060 10 319 0.031348
29 fractal_dimension_worst 1.001579 10 319 0.031348
In [79]:
##################################
# Identifying the numerical columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
display(numeric_column_outlier_summary[(numeric_column_outlier_summary['Skewness']>3) | (numeric_column_outlier_summary['Skewness']<(-3))].sort_values(by=['Skewness'], ascending=False))
Numeric.Column.Name Skewness Outlier.Count Row.Count Outlier.Ratio
13 area_se 6.562034 35 319 0.109718
16 concavity_se 5.648674 14 319 0.043887
19 fractal_dimension_se 4.280973 18 319 0.056426
12 perimeter_se 4.136225 24 319 0.075235
10 radius_se 3.775498 24 319 0.075235
In [80]:
##################################
# Identifying the numerical columns
# with Outlier.Ratio > 0.05
##################################
display(numeric_column_outlier_summary[numeric_column_outlier_summary['Outlier.Ratio']>0.05].sort_values(by=['Outlier.Ratio'], ascending=False))
Numeric.Column.Name Skewness Outlier.Count Row.Count Outlier.Ratio
13 area_se 6.562034 35 319 0.109718
10 radius_se 3.775498 24 319 0.075235
12 perimeter_se 4.136225 24 319 0.075235
14 smoothness_se 1.313172 19 319 0.059561
15 compactness_se 1.701432 19 319 0.059561
19 fractal_dimension_se 4.280973 18 319 0.056426
18 symmetry_se 2.442436 16 319 0.050157
In [81]:
##################################
# Formulating the individual boxplots
# for all numeric columns
##################################
for column in breast_cancer_train_numeric:
        plt.figure(figsize=(17,1))
        sns.boxplot(data=breast_cancer_train_numeric, x=column)
        plt.show()
        plt.close()
[Figure: individual boxplots for each of the 30 numeric columns in the training data]

1.4.3 Collinearity¶

Collinearity evaluation findings based on assessment of the training data are as follows:

  1. Predictors were predominantly positively correlated, with the middle 50% of pairwise correlation values (Q1 to Q3) ranging from 0.129 to 0.558.
  2. High Pearson.Correlation values > 0.90 were noted for 4.83% (21/435) of the pairwise combinations of predictors:
    • radius_mean and perimeter_mean: Pearson.Correlation = 0.997
    • radius_worst and perimeter_worst: Pearson.Correlation = 0.993
    • perimeter_mean and area_mean: Pearson.Correlation = 0.985
    • radius_mean and area_mean: Pearson.Correlation = 0.984
    • radius_worst and area_worst: Pearson.Correlation = 0.982
    • perimeter_worst and area_worst: Pearson.Correlation = 0.978
    • perimeter_mean and perimeter_worst: Pearson.Correlation = 0.972
    • perimeter_mean and radius_worst: Pearson.Correlation = 0.972
    • radius_mean and radius_worst: Pearson.Correlation = 0.971
    • radius_se and perimeter_se: Pearson.Correlation = 0.971
    • radius_mean and perimeter_worst: Pearson.Correlation = 0.967
    • area_mean and area_worst: Pearson.Correlation = 0.964
    • area_mean and radius_worst: Pearson.Correlation = 0.958
    • area_mean and perimeter_worst: Pearson.Correlation = 0.955
    • perimeter_mean and area_worst: Pearson.Correlation = 0.951
    • radius_se and area_se: Pearson.Correlation = 0.948
    • radius_mean and area_worst: Pearson.Correlation = 0.948
    • perimeter_se and area_se: Pearson.Correlation = 0.942
    • texture_mean and texture_worst: Pearson.Correlation = 0.923
    • concave points_mean and concave points_worst: Pearson.Correlation = 0.911
    • concavity_mean and concave points_mean: Pearson.Correlation = 0.900
In [82]:
##################################
# Creating a dataset copy
# with only the predictors present
# for correlation analysis
##################################
breast_cancer_train_correlation = breast_cancer_train.drop(['diagnosis'], axis=1)
display(breast_cancer_train_correlation)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
id
868826 14.950 17.57 96.85 678.1 0.11670 0.13050 0.15390 0.08624 0.1957 0.06216 ... 18.55 21.43 121.40 971.4 0.1411 0.21640 0.33550 0.16670 0.3414 0.07147
8810703 28.110 18.47 188.50 2499.0 0.11420 0.15160 0.32010 0.15950 0.1648 0.05525 ... 28.11 18.47 188.50 2499.0 0.1142 0.15160 0.32010 0.15950 0.1648 0.05525
906878 13.660 19.13 89.46 575.3 0.09057 0.11470 0.09657 0.04812 0.1848 0.06181 ... 15.14 25.50 101.40 708.8 0.1147 0.31670 0.36600 0.14070 0.2744 0.08839
911654 14.200 20.53 92.41 618.4 0.08931 0.11080 0.05063 0.03058 0.1506 0.06009 ... 16.45 27.26 112.10 828.5 0.1153 0.34290 0.25120 0.13390 0.2534 0.07858
903483 8.734 16.84 55.27 234.3 0.10390 0.07428 0.00000 0.00000 0.1985 0.07098 ... 10.17 22.80 64.01 317.0 0.1460 0.13100 0.00000 0.00000 0.2445 0.08865
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
84862001 16.130 20.68 108.10 798.8 0.11700 0.20220 0.17220 0.10280 0.2164 0.07356 ... 20.96 31.48 136.80 1315.0 0.1789 0.42330 0.47840 0.20730 0.3706 0.11420
90317302 10.260 12.22 65.75 321.6 0.09996 0.07542 0.01923 0.01968 0.1800 0.06569 ... 11.38 15.65 73.23 394.5 0.1343 0.16500 0.08615 0.06696 0.2937 0.07722
86211 12.180 17.84 77.79 451.1 0.10450 0.07057 0.02490 0.02941 0.1900 0.06635 ... 12.83 20.92 82.14 495.2 0.1140 0.09358 0.04980 0.05882 0.2227 0.07376
926954 16.600 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 ... 18.98 34.12 126.70 1124.0 0.1139 0.30940 0.34030 0.14180 0.2218 0.07820
86208 20.260 23.03 132.40 1264.0 0.09078 0.13130 0.14650 0.08683 0.2095 0.05649 ... 24.22 31.59 156.10 1750.0 0.1190 0.35390 0.40980 0.15730 0.3689 0.08368

319 rows × 30 columns

In [83]:
##################################
# Initializing the correlation matrix
##################################
breast_cancer_train_correlation_matrix = pd.DataFrame(np.zeros((len(breast_cancer_train_correlation.columns), len(breast_cancer_train_correlation.columns))),
                                                       columns=breast_cancer_train_correlation.columns,
                                                       index=breast_cancer_train_correlation.columns)
In [84]:
##################################
# Calculating different types
# of correlation coefficients
# per variable type
##################################
for i in range(len(breast_cancer_train_correlation.columns)):
    for j in range(i, len(breast_cancer_train_correlation.columns)):
        if i == j:
            breast_cancer_train_correlation_matrix.iloc[i, j] = 1.0  
        else:
            col_i = breast_cancer_train_correlation.iloc[:, i]
            col_j = breast_cancer_train_correlation.iloc[:, j]

            # Detecting binary variables (assumes binary variables are coded as 0/1)
            is_binary_i = col_i.nunique() == 2
            is_binary_j = col_j.nunique() == 2

            # Computing the Pearson correlation for two continuous variables
            if col_i.dtype in ['int64', 'float64'] and col_j.dtype in ['int64', 'float64']:
                corr = col_i.corr(col_j)

            # Computing the Point-Biserial correlation for continuous and binary variables
            elif (col_i.dtype in ['int64', 'float64'] and is_binary_j) or (col_j.dtype in ['int64', 'float64'] and is_binary_i):
                continuous_var = col_i if col_i.dtype in ['int64', 'float64'] else col_j
                binary_var = col_j if is_binary_j else col_i

                # Convert binary variable to 0/1 (if not already)
                binary_var = binary_var.astype('category').cat.codes
                corr, _ = pointbiserialr(continuous_var, binary_var)

            # Computing the Phi coefficient for two binary variables
            elif is_binary_i and is_binary_j:
                corr = col_i.corr(col_j) 

            # Computing the Cramér's V for two categorical variables (if more than 2 categories)
            else:
                contingency_table = pd.crosstab(col_i, col_j)
                chi2, _, _, _ = chi2_contingency(contingency_table)
                n = contingency_table.sum().sum()
                phi2 = chi2 / n
                r, k = contingency_table.shape
                corr = np.sqrt(phi2 / min(k - 1, r - 1))  # Cramér's V formula

            # Assigning correlation values to the matrix
            breast_cancer_train_correlation_matrix.iloc[i, j] = corr
            breast_cancer_train_correlation_matrix.iloc[j, i] = corr
# Displaying the correlation matrix
display(breast_cancer_train_correlation_matrix)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
radius_mean 1.000000 0.296754 0.997917 0.984916 0.145069 0.521699 0.653960 0.829568 0.178392 -0.362551 ... 0.971938 0.289640 0.967088 0.948294 0.072870 0.428210 0.496566 0.737347 0.217375 -0.038225
texture_mean 0.296754 1.000000 0.300384 0.293404 -0.081253 0.194167 0.243735 0.250668 0.068573 -0.098025 ... 0.319664 0.923246 0.322248 0.312728 0.004546 0.214524 0.207238 0.222253 0.068177 0.054058
perimeter_mean 0.997917 0.300384 1.000000 0.985186 0.180173 0.570667 0.691792 0.855888 0.209569 -0.313834 ... 0.972461 0.293613 0.972875 0.951121 0.103165 0.468946 0.531674 0.762926 0.235097 0.006081
area_mean 0.984916 0.293404 0.985186 1.000000 0.155662 0.519067 0.673052 0.829811 0.177404 -0.320518 ... 0.958283 0.275844 0.955452 0.964157 0.080662 0.406863 0.489149 0.712358 0.188049 -0.033382
smoothness_mean 0.145069 -0.081253 0.180173 0.155662 1.000000 0.618184 0.497254 0.538361 0.530327 0.518288 ... 0.197350 -0.022554 0.220016 0.198870 0.792618 0.414057 0.397902 0.487924 0.337613 0.444924
compactness_mean 0.521699 0.194167 0.570667 0.519067 0.618184 1.000000 0.878930 0.834485 0.586311 0.503176 ... 0.558509 0.206030 0.612310 0.546968 0.515692 0.862123 0.797969 0.821965 0.453133 0.659234
concavity_mean 0.653960 0.243735 0.691792 0.673052 0.497254 0.878930 1.000000 0.900196 0.510785 0.329362 ... 0.675255 0.253589 0.714621 0.683360 0.417894 0.751441 0.886489 0.851058 0.397756 0.514930
concave points_mean 0.829568 0.250668 0.855888 0.829811 0.538361 0.834485 0.900196 1.000000 0.454541 0.108612 ... 0.846434 0.268006 0.868059 0.835507 0.422292 0.668006 0.723963 0.911806 0.374473 0.331439
symmetry_mean 0.178392 0.068573 0.209569 0.177404 0.530327 0.586311 0.510785 0.454541 1.000000 0.419840 ... 0.220408 0.083343 0.247568 0.214373 0.412629 0.476576 0.453587 0.439723 0.689259 0.420701
fractal_dimension_mean -0.362551 -0.098025 -0.313834 -0.320518 0.518288 0.503176 0.329362 0.108612 0.419840 1.000000 ... -0.307089 -0.093992 -0.258932 -0.270836 0.447918 0.394331 0.327619 0.123832 0.196195 0.759504
radius_se 0.658347 0.229739 0.669725 0.719571 0.280807 0.473529 0.591306 0.661259 0.267726 -0.028843 ... 0.679356 0.147109 0.678844 0.727147 0.080241 0.249196 0.326790 0.483566 0.057317 0.006544
texture_se -0.063347 0.436088 -0.053460 -0.035532 0.067398 0.045054 0.074459 0.026147 0.063259 0.175273 ... -0.083551 0.452090 -0.077713 -0.060645 -0.103064 -0.130696 -0.092730 -0.120547 -0.195414 -0.054273
perimeter_se 0.663993 0.235670 0.681743 0.726247 0.277255 0.528764 0.621664 0.679461 0.276983 0.009488 ... 0.669812 0.153531 0.688770 0.718316 0.069734 0.309735 0.366793 0.510473 0.065297 0.048577
area_se 0.696051 0.210780 0.705619 0.774018 0.219236 0.429172 0.573166 0.643176 0.197534 -0.112437 ... 0.699869 0.139774 0.702817 0.771195 0.061473 0.249041 0.328935 0.479149 0.041415 -0.023982
smoothness_se -0.282663 -0.026715 -0.262615 -0.208247 0.309776 0.093602 0.069748 -0.028748 0.132195 0.446319 ... -0.281678 -0.113239 -0.270776 -0.216902 0.305564 -0.101057 -0.080945 -0.155433 -0.184909 0.113992
compactness_se 0.161000 0.116722 0.204162 0.180221 0.233059 0.706181 0.646702 0.440636 0.377026 0.595560 ... 0.166451 0.053456 0.222276 0.182170 0.131374 0.632269 0.608224 0.448014 0.163800 0.599360
concavity_se 0.101351 0.044926 0.132131 0.129111 0.203394 0.508586 0.664396 0.354830 0.340789 0.506201 ... 0.107206 0.007025 0.142656 0.128633 0.123140 0.429350 0.652703 0.382299 0.170189 0.461893
concave points_se 0.338116 0.079973 0.368434 0.339009 0.362823 0.650154 0.690708 0.591120 0.374946 0.383054 ... 0.330233 0.012760 0.363817 0.327707 0.160674 0.451405 0.556709 0.590946 0.107170 0.332135
symmetry_se -0.020080 0.031838 -0.001422 0.014059 0.160089 0.209686 0.200530 0.120330 0.373312 0.267390 ... -0.055671 -0.059537 -0.041901 -0.037870 -0.070226 -0.020577 0.024522 -0.022468 0.320748 -0.020828
fractal_dimension_se -0.086706 -0.004000 -0.051803 -0.054896 0.200008 0.457416 0.433186 0.204754 0.284368 0.698610 ... -0.077667 -0.064249 -0.042828 -0.050680 0.086398 0.336647 0.354796 0.174112 0.015405 0.582141
radius_worst 0.971938 0.319664 0.972461 0.958283 0.197350 0.558509 0.675255 0.846434 0.220408 -0.307089 ... 1.000000 0.341791 0.993610 0.982412 0.175453 0.494388 0.550967 0.788192 0.294281 0.050938
texture_worst 0.289640 0.923246 0.293613 0.275844 -0.022554 0.206030 0.253589 0.268006 0.083343 -0.093992 ... 0.341791 1.000000 0.345039 0.323485 0.145721 0.290799 0.277103 0.299552 0.189918 0.139916
perimeter_worst 0.967088 0.322248 0.972875 0.955452 0.220016 0.612310 0.714621 0.868059 0.247568 -0.258932 ... 0.993610 0.345039 1.000000 0.978668 0.196497 0.553308 0.597206 0.816546 0.310463 0.104998
area_worst 0.948294 0.312728 0.951121 0.964157 0.198870 0.546968 0.683360 0.835507 0.214373 -0.270836 ... 0.982412 0.323485 0.978668 1.000000 0.174507 0.467797 0.537041 0.755701 0.258457 0.050037
smoothness_worst 0.072870 0.004546 0.103165 0.080662 0.792618 0.515692 0.417894 0.422292 0.412629 0.447918 ... 0.175453 0.145721 0.196497 0.174507 1.000000 0.513382 0.478523 0.506041 0.446709 0.579201
compactness_worst 0.428210 0.214524 0.468946 0.406863 0.414057 0.862123 0.751441 0.668006 0.476576 0.394331 ... 0.494388 0.290799 0.553308 0.467797 0.513382 1.000000 0.869064 0.805226 0.555227 0.782035
concavity_worst 0.496566 0.207238 0.531674 0.489149 0.397902 0.797969 0.886489 0.723963 0.453587 0.327619 ... 0.550967 0.277103 0.597206 0.537041 0.478523 0.869064 1.000000 0.834462 0.510184 0.666844
concave points_worst 0.737347 0.222253 0.762926 0.712358 0.487924 0.821965 0.851058 0.911806 0.439723 0.123832 ... 0.788192 0.299552 0.816546 0.755701 0.506041 0.805226 0.834462 1.000000 0.496234 0.478328
symmetry_worst 0.217375 0.068177 0.235097 0.188049 0.337613 0.453133 0.397756 0.374473 0.689259 0.196195 ... 0.294281 0.189918 0.310463 0.258457 0.446709 0.555227 0.510184 0.496234 1.000000 0.427291
fractal_dimension_worst -0.038225 0.054058 0.006081 -0.033382 0.444924 0.659234 0.514930 0.331439 0.420701 0.759504 ... 0.050938 0.139916 0.104998 0.050037 0.579201 0.782035 0.666844 0.478328 0.427291 1.000000

30 rows × 30 columns

In [85]:
##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
plt.figure(figsize=(25, 12))
sns.heatmap(breast_cancer_train_correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()
[Figure: correlation heatmap for all pairwise combinations of numeric columns]
In [86]:
##################################
# Formulating the pairwise correlation summary
# between the predictor columns
##################################
# Converting the correlation matrix to a long format
breast_cancer_train_correlation_summary = (
    breast_cancer_train_correlation_matrix
    # keeping the upper triangle of the correlation matrix
    .where(~np.tril(np.ones(breast_cancer_train_correlation_matrix.shape)).astype(bool))  
    # converting to a long format
    .stack()  
    .reset_index()
)

# Renaming the summary columns
breast_cancer_train_correlation_summary.columns = ['Predictor1.Column.Name', 'Predictor2.Column.Name', 'Pearson.Correlation']

# Sorting from highest to lowest
breast_cancer_train_correlation_summary = breast_cancer_train_correlation_summary.sort_values(by='Pearson.Correlation', ascending=False).reset_index(drop=True)

# Displaying the summary table
display(breast_cancer_train_correlation_summary)
Predictor1.Column.Name Predictor2.Column.Name Pearson.Correlation
0 radius_mean perimeter_mean 0.997917
1 radius_worst perimeter_worst 0.993610
2 perimeter_mean area_mean 0.985186
3 radius_mean area_mean 0.984916
4 radius_worst area_worst 0.982412
... ... ... ...
430 radius_mean smoothness_se -0.282663
431 fractal_dimension_mean radius_worst -0.307089
432 perimeter_mean fractal_dimension_mean -0.313834
433 area_mean fractal_dimension_mean -0.320518
434 radius_mean fractal_dimension_mean -0.362551

435 rows × 3 columns

In [87]:
##################################
# Exploring the pairwise correlation values
# between the predictor columns
##################################
breast_cancer_train_correlation_exploration = (
    breast_cancer_train_correlation_matrix
    .where(~np.tril(np.ones(breast_cancer_train_correlation_matrix.shape)).astype(bool))
    .stack()
    .values
)

# Computing the quartiles and IQR
correlation_q1 = np.percentile(breast_cancer_train_correlation_exploration, 25)
correlation_q3 = np.percentile(breast_cancer_train_correlation_exploration, 75)
correlation_iqr = correlation_q3 - correlation_q1

print(f"Q1 (25th percentile): {correlation_q1:.3f}")
print(f"Q3 (75th percentile): {correlation_q3:.3f}")
print(f"IQR (Q3 - Q1): {correlation_iqr:.3f}")
Q1 (25th percentile): 0.129
Q3 (75th percentile): 0.558
IQR (Q3 - Q1): 0.429
In [88]:
##################################
# Determining the highly collinear predictors
# with Pearson Correlation > 0.90
##################################
breast_cancer_train_correlation_summary_highcollinearity = breast_cancer_train_correlation_summary[breast_cancer_train_correlation_summary['Pearson.Correlation'].abs() > 0.90].reset_index(drop=True)
display(breast_cancer_train_correlation_summary_highcollinearity)
Predictor1.Column.Name Predictor2.Column.Name Pearson.Correlation
0 radius_mean perimeter_mean 0.997917
1 radius_worst perimeter_worst 0.993610
2 perimeter_mean area_mean 0.985186
3 radius_mean area_mean 0.984916
4 radius_worst area_worst 0.982412
5 perimeter_worst area_worst 0.978668
6 perimeter_mean perimeter_worst 0.972875
7 perimeter_mean radius_worst 0.972461
8 radius_mean radius_worst 0.971938
9 radius_se perimeter_se 0.971589
10 radius_mean perimeter_worst 0.967088
11 area_mean area_worst 0.964157
12 area_mean radius_worst 0.958283
13 area_mean perimeter_worst 0.955452
14 perimeter_mean area_worst 0.951121
15 radius_se area_se 0.948731
16 radius_mean area_worst 0.948294
17 perimeter_se area_se 0.942853
18 texture_mean texture_worst 0.923246
19 concave points_mean concave points_worst 0.911806
20 concavity_mean concave points_mean 0.900196

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis¶

Exploratory data analysis findings are as follows:

  1. Bivariate analysis identified individual predictors with generally positive association to the target variable based on visual inspection.
  2. A total of 24 of 30 predictors demonstrated higher values under the diagnosis=M category compared to measurements under the diagnosis=B category:
    • radius_mean
    • texture_mean
    • perimeter_mean
    • area_mean
    • compactness_mean
    • concavity_mean
    • concave points_mean
    • symmetry_mean
    • radius_se
    • perimeter_se
    • area_se
    • compactness_se
    • concave points_se
    • fractal_dimension_se
    • radius_worst
    • texture_worst
    • perimeter_worst
    • area_worst
    • smoothness_worst
    • compactness_worst
    • concavity_worst
    • concave points_worst
    • symmetry_worst
    • fractal_dimension_worst
In [89]:
##################################
# Segregating the target
# and predictor variables
##################################
breast_cancer_train_predictors_numeric = breast_cancer_train.iloc[:,1:].columns
In [90]:
##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = breast_cancer_train_predictors_numeric
In [91]:
##################################
# Segregating the target variable
# and numeric predictors
##################################
boxplot_y_variable = 'diagnosis'
boxplot_x_variables = numeric_variable_name_list.values
In [92]:
##################################
# Defining the number of 
# rows and columns for the subplots
##################################
num_rows = 10
num_cols = 3
In [93]:
##################################
# Formulating the subplot structure
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 40))

##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()

##################################
# Formulating the individual boxplots
# for all scaled numeric columns
##################################
for i, x_variable in enumerate(boxplot_x_variables):
    ax = axes[i]
    ax.boxplot([group[x_variable] for name, group in breast_cancer_train.groupby(boxplot_y_variable, observed=True)])
    ax.set_title(f'{boxplot_y_variable} Versus {x_variable}')
    ax.set_xlabel(boxplot_y_variable)
    ax.set_ylabel(x_variable)
    ax.set_xticks(range(1, len(breast_cancer_train[boxplot_y_variable].unique()) + 1), ['B', 'M'])

##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()

##################################
# Presenting the subplots
##################################
plt.show()
[Figure: boxplots of each numeric predictor grouped by the diagnosis target variable]

1.5.2 Hypothesis Testing¶

  1. The relationship between the numeric predictors and the diagnosis target variable was statistically evaluated using the following hypotheses:
    • Null: Difference in the means between groups B and M is equal to zero
    • Alternative: Difference in the means between groups B and M is not equal to zero
  2. There is sufficient evidence of a statistically significant difference between the means of the numeric measurements obtained from the B and M groups of the diagnosis target variable for 26 of the 30 numeric predictors, given their high t-test statistic values and low p-values below the significance level of 0.05 (the pooled t-statistic used is shown after this list).
    • perimeter_worst: T.Test.Statistic=-23.391, T.Test.PValue=0.000
    • radius_worst: T.Test.Statistic=-23.228, T.Test.PValue=0.000
    • concave points_worst: T.Test.Statistic=-21.365, T.Test.PValue=0.000
    • concave points_mean: T.Test.Statistic=-21.258, T.Test.PValue=0.000
    • area_worst: T.Test.Statistic=-20.310, T.Test.PValue=0.000
    • perimeter_mean: T.Test.Statistic=-20.086, T.Test.PValue=0.000
    • radius_mean: T.Test.Statistic=-19.510, T.Test.PValue=0.000
    • area_mean: T.Test.Statistic=-17.991, T.Test.PValue=0.000
    • concavity_mean: T.Test.Statistic=-15.314, T.Test.PValue=0.000
    • concavity_worst: T.Test.Statistic=-13.368, T.Test.PValue=0.000
    • compactness_mean: T.Test.Statistic=-12.647, T.Test.PValue=0.000
    • compactness_worst: T.Test.Statistic=-12.079, T.Test.PValue=0.000
    • radius_se: T.Test.Statistic=-11.532, T.Test.PValue=0.000
    • perimeter_se: T.Test.Statistic=-11.234, T.Test.PValue=0.000
    • area_se: T.Test.Statistic=-10.375, T.Test.PValue=0.000
    • symmetry_worst: T.Test.Statistic=-8.312, T.Test.PValue=0.000
    • texture_worst: T.Test.Statistic=-7.911, T.Test.PValue=0.000
    • smoothness_worst: T.Test.Statistic=-7.080, T.Test.PValue=0.000
    • texture_mean: T.Test.Statistic=-6.682, T.Test.PValue=0.000
    • concave points_se: T.Test.Statistic=-6.679, T.Test.PValue=0.000
    • symmetry_mean: T.Test.Statistic=-6.315, T.Test.PValue=0.000
    • smoothness_mean: T.Test.Statistic=-6.087, T.Test.PValue=0.000
    • fractal_dimension_worst: T.Test.Statistic=-4.740, T.Test.PValue=0.000
    • compactness_se: T.Test.Statistic=-3.733, T.Test.PValue=0.000
    • concavity_se: T.Test.Statistic=-2.703, T.Test.PValue=0.007
    • smoothness_se: T.Test.Statistic=+2.425, T.Test.PValue=0.015
  3. Feature extraction using Principal Component Analysis was explored to address the high number of correlated predictors, several of which also showed high skewness and outlier ratios. The 30 predictors can potentially be reduced to just 10 uncorrelated principal components representing approximately 95% of the original variance.
    • pc_1: Explained_Variance_Ratio=0.426, Cumulative_Explained_Variance=0.426
    • pc_2: Explained_Variance_Ratio=0.189, Cumulative_Explained_Variance=0.615
    • pc_3: Explained_Variance_Ratio=0.101, Cumulative_Explained_Variance=0.717
    • pc_4: Explained_Variance_Ratio=0.068, Cumulative_Explained_Variance=0.786
    • pc_5: Explained_Variance_Ratio=0.058, Cumulative_Explained_Variance=0.845
    • pc_6: Explained_Variance_Ratio=0.042, Cumulative_Explained_Variance=0.887
    • pc_7: Explained_Variance_Ratio=0.022, Cumulative_Explained_Variance=0.910
    • pc_8: Explained_Variance_Ratio=0.016, Cumulative_Explained_Variance=0.926
    • pc_9: Explained_Variance_Ratio=0.014, Cumulative_Explained_Variance=0.941
    • pc_10: Explained_Variance_Ratio=0.011, Cumulative_Explained_Variance=0.953
    • pc_11: Explained_Variance_Ratio=0.010, Cumulative_Explained_Variance=0.963
    • pc_12: Explained_Variance_Ratio=0.008, Cumulative_Explained_Variance=0.972
    • pc_13: Explained_Variance_Ratio=0.007, Cumulative_Explained_Variance=0.979
    • pc_14: Explained_Variance_Ratio=0.004, Cumulative_Explained_Variance=0.984
    • pc_15: Explained_Variance_Ratio=0.002, Cumulative_Explained_Variance=0.986
    • pc_16: Explained_Variance_Ratio=0.002, Cumulative_Explained_Variance=0.989
    • pc_17: Explained_Variance_Ratio=0.001, Cumulative_Explained_Variance=0.991
    • pc_18: Explained_Variance_Ratio=0.001, Cumulative_Explained_Variance=0.993
    • pc_19: Explained_Variance_Ratio=0.001, Cumulative_Explained_Variance=0.994
    • pc_20: Explained_Variance_Ratio=0.001, Cumulative_Explained_Variance=0.995
    • pc_21: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.996
    • pc_22: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.997
    • pc_23: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.998
    • pc_24: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_25: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_26: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_27: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_28: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_29: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=0.999
    • pc_30: Explained_Variance_Ratio=0.000, Cumulative_Explained_Variance=1.000
  4. There is sufficient evidence of a statistically significant difference between the means of the principal component values obtained from the B and M groups of the diagnosis target variable for 6 of the 30 principal component predictors, given their high t-test statistic values and low p-values below the significance level of 0.05. The 30 predictors can potentially be reduced to as few as 3 uncorrelated principal components demonstrating sufficient discrimination.
    • pc_1: T.Test.Statistic=-21.406, T.Test.PValue=0.000
    • pc_2: T.Test.Statistic=+4.080, T.Test.PValue=0.000
    • pc_3: T.Test.Statistic=+3.192, T.Test.PValue=0.002
    • pc_13: T.Test.Statistic=-2.299, T.Test.PValue=0.022
    • pc_17: T.Test.Statistic=+2.256, T.Test.PValue=0.024
    • pc_20: T.Test.Statistic=-2.001, T.Test.PValue=0.046
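For reference, the pooled (Student's) two-sample t-statistic computed by scipy.stats.ttest_ind with equal_var=True, as applied in the cells below, is:

$$
t = \frac{\bar{x}_B - \bar{x}_M}{s_p \sqrt{\frac{1}{n_B} + \frac{1}{n_M}}},
\qquad
s_p^2 = \frac{(n_B - 1)\,s_B^2 + (n_M - 1)\,s_M^2}{n_B + n_M - 2}
$$

where $\bar{x}_B$, $s_B^2$, $n_B$ and $\bar{x}_M$, $s_M^2$, $n_M$ denote the group means, variances, and sizes, and the statistic is referenced against a t-distribution with $n_B + n_M - 2$ degrees of freedom.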
In [94]:
##################################
# Computing the t-test 
# statistic and p-values
# between the target variable
# and numeric predictor columns
##################################
breast_cancer_train_numeric_ttest_target = {}
breast_cancer_train_numeric = breast_cancer_train.iloc[:,1:]
breast_cancer_train_numeric_columns = breast_cancer_train.iloc[:,1:].columns
for numeric_column in breast_cancer_train_numeric_columns:
    group_B = breast_cancer_train[breast_cancer_train.loc[:,'diagnosis']=='B']
    group_M = breast_cancer_train[breast_cancer_train.loc[:,'diagnosis']=='M']
    breast_cancer_train_numeric_ttest_target['diagnosis_' + numeric_column] = stats.ttest_ind(
        group_B[numeric_column], 
        group_M[numeric_column], 
        equal_var=True)
In [95]:
##################################
# Formulating the pairwise ttest summary
# between the target variable
# and numeric predictor columns
##################################
breast_cancer_train_numeric_hypothesistesting_summary = pd.DataFrame.from_dict(breast_cancer_train_numeric_ttest_target, orient='index')
breast_cancer_train_numeric_hypothesistesting_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(breast_cancer_train_numeric_hypothesistesting_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(30))
T.Test.Statistic T.Test.PValue
diagnosis_perimeter_worst -23.391423 5.216127e-71
diagnosis_radius_worst -23.228204 2.124527e-70
diagnosis_concave points_worst -21.365587 2.304689e-63
diagnosis_concave points_mean -21.258584 5.896498e-63
diagnosis_area_worst -20.310881 2.507249e-59
diagnosis_perimeter_mean -20.086310 1.830848e-58
diagnosis_radius_mean -19.510552 3.031653e-56
diagnosis_area_mean -17.991971 2.290509e-50
diagnosis_concavity_mean -15.314435 5.174576e-40
diagnosis_concavity_worst -13.368057 1.245191e-32
diagnosis_compactness_mean -12.647550 5.808618e-30
diagnosis_compactness_worst -12.079671 6.827871e-28
diagnosis_radius_se -11.532905 6.238111e-26
diagnosis_perimeter_se -11.234387 7.087958e-25
diagnosis_area_se -10.375886 6.586298e-22
diagnosis_symmetry_worst -8.312820 2.780206e-15
diagnosis_texture_worst -7.911132 4.296038e-14
diagnosis_smoothness_worst -7.080658 9.290923e-12
diagnosis_texture_mean -6.682817 1.055204e-10
diagnosis_concave points_se -6.679983 1.073250e-10
diagnosis_symmetry_mean -6.315327 9.103085e-10
diagnosis_smoothness_mean -6.087615 3.308230e-09
diagnosis_fractal_dimension_worst -4.740955 3.218718e-06
diagnosis_compactness_se -3.733659 2.236727e-04
diagnosis_concavity_se -2.703321 7.235270e-03
diagnosis_smoothness_se 2.425051 1.586462e-02
diagnosis_fractal_dimension_mean 1.513439 1.311644e-01
diagnosis_texture_se 0.432444 6.657128e-01
diagnosis_symmetry_se 0.155224 8.767432e-01
diagnosis_fractal_dimension_se -0.073082 9.417872e-01
In [96]:
##################################
# Exploring a feature extraction approach
# using Principal Component Analysis
# to address the high number of correlated predictors
# noted with high skewness and outlier ratio
##################################
# Standardizing predictors to address
# differences in scaling
##################################
scaler = StandardScaler()
breast_cancer_train_numeric_scaled = scaler.fit_transform(breast_cancer_train_numeric) 
breast_cancer_train_numeric_scaled = pd.DataFrame(breast_cancer_train_numeric_scaled,
                                                  columns=breast_cancer_train_numeric.columns,
                                                  index=breast_cancer_train_numeric.index)
In [97]:
##################################
# Conducting Principal Component Analysis
# on the standardized predictors
##################################
n_components = breast_cancer_train_numeric_scaled.shape[1]
pca = PCA(n_components=n_components, svd_solver='full', random_state=987654321)
breast_cancer_train_numeric_scaled_pcs = pca.fit_transform(breast_cancer_train_numeric_scaled)
In [98]:
##################################
# Consolidating the principal components
# into a dataframe and reattaching
# the diagnosis target column
##################################
pc_cols = [f'pc_{i+1}' for i in range(n_components)]
breast_cancer_train_numeric_scaled_pcs = pd.DataFrame(breast_cancer_train_numeric_scaled_pcs, columns=pc_cols, index=breast_cancer_train_numeric_scaled.index)
breast_cancer_train_pcs = pd.concat([breast_cancer_train[['diagnosis']].copy(), breast_cancer_train_numeric_scaled_pcs], axis=1)
In [99]:
##################################
# Consolidating the explained variance ratio
# for the principal components
##################################
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio_summary = pd.DataFrame({
    'PC': pc_cols,
    'Explained_Variance_Ratio': explained_variance_ratio,
    'Cumulative_Explained_Variance': np.cumsum(explained_variance_ratio)
}).set_index('PC')
display(explained_variance_ratio_summary)
Explained_Variance_Ratio Cumulative_Explained_Variance
PC
pc_1 0.426228 0.426228
pc_2 0.189411 0.615639
pc_3 0.101749 0.717388
pc_4 0.068995 0.786383
pc_5 0.058895 0.845278
pc_6 0.042254 0.887533
pc_7 0.022768 0.910300
pc_8 0.016543 0.926843
pc_9 0.014899 0.941743
pc_10 0.011865 0.953608
pc_11 0.010183 0.963790
pc_12 0.008323 0.972114
pc_13 0.007802 0.979915
pc_14 0.004232 0.984147
pc_15 0.002850 0.986997
pc_16 0.002469 0.989465
pc_17 0.001967 0.991433
pc_18 0.001811 0.993243
pc_19 0.001471 0.994714
pc_20 0.001133 0.995847
pc_21 0.000952 0.996800
pc_22 0.000891 0.997691
pc_23 0.000713 0.998404
pc_24 0.000599 0.999002
pc_25 0.000480 0.999482
pc_26 0.000242 0.999724
pc_27 0.000203 0.999927
pc_28 0.000044 0.999972
pc_29 0.000024 0.999996
pc_30 0.000004 1.000000
In [100]:
##################################
# Computing the t-test 
# statistic and p-values
# between the target variable
# and principal component predictor columns
##################################
breast_cancer_train_pcs_ttest_target = {}
breast_cancer_train_pcs_numeric = breast_cancer_train_pcs.iloc[:,1:]
breast_cancer_train_pcs_numeric_columns = breast_cancer_train_pcs.iloc[:,1:].columns
for numeric_column in breast_cancer_train_pcs_numeric_columns:
    group_B = breast_cancer_train_pcs[breast_cancer_train_pcs.loc[:,'diagnosis']=='B']
    group_M = breast_cancer_train_pcs[breast_cancer_train_pcs.loc[:,'diagnosis']=='M']
    breast_cancer_train_pcs_ttest_target['diagnosis_' + numeric_column] = stats.ttest_ind(
        group_B[numeric_column], 
        group_M[numeric_column], 
        equal_var=True)
In [101]:
##################################
# Formulating the pairwise ttest summary
# between the target variable
# and principal component predictor columns
##################################
breast_cancer_train_pcs_numeric_hypothesistesting_summary = pd.DataFrame.from_dict(breast_cancer_train_pcs_ttest_target, orient='index')
breast_cancer_train_pcs_numeric_hypothesistesting_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(breast_cancer_train_pcs_numeric_hypothesistesting_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(30))
T.Test.Statistic T.Test.PValue
diagnosis_pc_1 -21.406124 1.614914e-63
diagnosis_pc_2 4.080724 5.686808e-05
diagnosis_pc_3 3.192160 1.553738e-03
diagnosis_pc_13 -2.299656 2.211727e-02
diagnosis_pc_17 2.256550 2.471705e-02
diagnosis_pc_20 -2.001077 4.623628e-02
diagnosis_pc_4 -1.925622 5.504581e-02
diagnosis_pc_5 -1.762550 7.893997e-02
diagnosis_pc_14 -1.532393 1.264228e-01
diagnosis_pc_15 1.358293 1.753365e-01
diagnosis_pc_19 1.279015 2.018272e-01
diagnosis_pc_30 1.130236 2.592313e-01
diagnosis_pc_24 1.123771 2.619603e-01
diagnosis_pc_12 -1.010027 3.132526e-01
diagnosis_pc_25 -0.976871 3.293781e-01
diagnosis_pc_8 -0.911846 3.625425e-01
diagnosis_pc_26 0.838271 4.025101e-01
diagnosis_pc_16 -0.650378 5.159193e-01
diagnosis_pc_7 -0.576641 5.645909e-01
diagnosis_pc_23 0.503616 6.148809e-01
diagnosis_pc_29 -0.494381 6.213796e-01
diagnosis_pc_28 0.373462 7.090540e-01
diagnosis_pc_11 -0.362203 7.174415e-01
diagnosis_pc_9 0.261159 7.941393e-01
diagnosis_pc_10 -0.225840 8.214716e-01
diagnosis_pc_18 -0.221574 8.247879e-01
diagnosis_pc_22 -0.201455 8.404724e-01
diagnosis_pc_6 0.155608 8.764409e-01
diagnosis_pc_21 -0.138978 8.895559e-01
diagnosis_pc_27 -0.105049 9.164037e-01

1.6. Premodelling Data Preparation ¶

1.6.1 Preprocessed Data Description¶

  1. Due to the considerable number of predictors showing high skewness, high outlier ratios, and multicollinearity, standardization and PCA feature extraction were performed to address issues with distributional shape and pairwise correlation.
    • High skewness observed for 5 variables with Skewness>3 or Skewness<(-3).
      • area_se: Skewness = 6.562
      • concavity_se: Skewness = 5.648
      • fractal_dimension_se: Skewness = 4.280
      • perimeter_se: Skewness = 4.136
      • radius_se: Skewness = 3.775
    • Relatively high number of outliers observed for 7 numeric variables with Outlier.Ratio>0.05.
      • area_se: Outlier.Ratio = 0.110
      • radius_se: Outlier.Ratio = 0.075
      • perimeter_se: Outlier.Ratio = 0.075
      • smoothness_se: Outlier.Ratio = 0.059
      • compactness_se: Outlier.Ratio = 0.059
      • fractal_dimension_se: Outlier.Ratio = 0.056
      • symmetry_se: Outlier.Ratio = 0.050
    • High Pearson.Correlation values > 0.90 were noted for 4.83% (21/435) of the pairwise combinations of predictors:
      • radius_mean and perimeter_mean: Pearson.Correlation = 0.997
      • radius_worst and perimeter_worst: Pearson.Correlation = 0.993
      • perimeter_mean and area_mean: Pearson.Correlation = 0.985
      • radius_mean and area_mean: Pearson.Correlation = 0.984
      • radius_worst and area_worst: Pearson.Correlation = 0.982
      • perimeter_worst and area_worst: Pearson.Correlation = 0.978
      • perimeter_mean and perimeter_worst: Pearson.Correlation = 0.972
      • perimeter_mean and radius_worst: Pearson.Correlation = 0.972
      • radius_mean and radius_worst: Pearson.Correlation = 0.971
      • radius_se and perimeter_se: Pearson.Correlation = 0.971
      • radius_mean and perimeter_worst: Pearson.Correlation = 0.967
      • area_mean and area_worst: Pearson.Correlation = 0.964
      • area_mean and radius_worst: Pearson.Correlation = 0.958
      • area_mean and perimeter_worst: Pearson.Correlation = 0.955
      • perimeter_mean and area_worst: Pearson.Correlation = 0.951
      • radius_se and area_se: Pearson.Correlation = 0.948
      • radius_mean and area_worst: Pearson.Correlation = 0.948
      • perimeter_se and area_se: Pearson.Correlation = 0.942
      • texture_mean and texture_worst: Pearson.Correlation = 0.923
      • concave points_mean and concave points_worst: Pearson.Correlation = 0.911
      • concavity_mean and concave points_mean: Pearson.Correlation = 0.900
  2. Based on the assessment of cumulative explained variance and discrimination power of the extracted principal components, the number of predictors can range from 3 to 10 (see the component-selection sketch after this list).
  3. To enable diversity among predictors, 10 principal components were used for the downstream modeling process.
  4. The preprocessed train dataset (final) is comprised of:
    • 319 rows (observations)
      • 200 diagnosis=B: 62.69%
      • 119 diagnosis=M: 37.30%
    • 11 columns (variables)
      • 1/11 target (categorical)
        • diagnosis
      • 10/11 predictor (numeric)
        • pc_1
        • pc_2
        • pc_3
        • pc_4
        • pc_5
        • pc_6
        • pc_7
        • pc_8
        • pc_9
        • pc_10
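As a minimal sketch of how the component count could be derived programmatically (assuming the explained_variance_ratio_summary table from the earlier PCA cell is still in memory), the smallest number of components reaching a 95% cumulative explained variance threshold can be located directly:

##################################
# Sketch: locating the smallest number of
# principal components whose cumulative explained
# variance reaches a chosen threshold (0.95 here)
##################################
import numpy as np

cumulative_variance = explained_variance_ratio_summary['Cumulative_Explained_Variance'].values
n_components_selected = int(np.argmax(cumulative_variance >= 0.95) + 1)
print(f"Components needed to reach 95% cumulative explained variance: {n_components_selected}")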

1.6.2 Preprocessing Pipeline Development¶

  1. A preprocessing pipeline was formulated and applied to the train data (final), validation data and test data with the following actions:
    • Applied standardization to address differences in scale among the predictors
    • Performed feature extraction using Principal Component Analysis on the scaled predictors
    • Filtered the predictors to the top 10 principal components
In [102]:
##################################
# Formulating a preprocessing pipeline
# that performs standardization,
# applies feature extraction using PCA, and
# filters the first 10 principal components as predictors
##################################
def preprocess_dataset(train_df: pd.DataFrame, 
                       evaluation_df: pd.DataFrame, 
                       n_components: int = 10, 
                       random_state: int = 987654321) -> pd.DataFrame:  
    # Splitting the target and predictor columns
    target_col = train_df.columns[0]
    X_train = train_df.iloc[:, 1:]
    y_train = train_df.iloc[:, 0]
    X_test = evaluation_df.iloc[:, 1:]
    y_test = evaluation_df.iloc[:, 0]

    # Fitting StandardScaler on training data and transforming both training and evaluation data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fitting PCA on the training data and transforming both the training and evaluation data
    pca = PCA(n_components=min(n_components, X_train.shape[1]), random_state=random_state)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Preparing the output DataFrame for the evaluation data
    pc_cols = [f'pc_{i+1}' for i in range(X_test_pca.shape[1])]
    scaled_pcatransformed_evaluation_df = pd.DataFrame(X_test_pca, columns=pc_cols, index=evaluation_df.index)

    # Add target column back as first column
    scaled_pcatransformed_evaluation_df.insert(0, target_col, y_test.values)

    # Printing variance explained for reference
    explained_var = np.cumsum(pca.explained_variance_ratio_)
    print(f"Explained Variance (First {n_components} PCs): {explained_var[-1]:.4f}")

    return scaled_pcatransformed_evaluation_df
In [103]:
##################################
# Applying the preprocessing pipeline
# to the train data
##################################
breast_cancer_preprocessed_train = preprocess_dataset(breast_cancer_train, breast_cancer_train, 10, 987654321)
X_preprocessed_train = breast_cancer_preprocessed_train.drop('diagnosis', axis = 1)
y_preprocessed_train = breast_cancer_preprocessed_train['diagnosis']
breast_cancer_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_PATH, "breast_cancer_preprocessed_train.csv"), index=False)
X_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_FEATURES_PATH, "X_preprocessed_train.csv"), index=False)
y_preprocessed_train.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TRAIN_TARGET_PATH, "y_preprocessed_train.csv"), index=False)
print('Final Preprocessed Train Dataset Dimensions: ')
display(X_preprocessed_train.shape)
display(y_preprocessed_train.shape)
print('Final Preprocessed Train Target Variable Breakdown: ')
display(y_preprocessed_train.value_counts())
print('Final Preprocessed Train Target Variable Proportion: ')
display(y_preprocessed_train.value_counts(normalize = True))
breast_cancer_preprocessed_train.head()
Explained Variance (First 10 PCs): 0.9536
Final Preprocessed Train Dataset Dimensions: 
(319, 10)
(319,)
Final Preprocessed Train Target Variable Breakdown: 
diagnosis
B    200
M    119
Name: count, dtype: int64
Final Preprocessed Train Target Variable Proportion: 
diagnosis
B    0.626959
M    0.373041
Name: proportion, dtype: float64
Out[103]:
diagnosis pc_1 pc_2 pc_3 pc_4 pc_5 pc_6 pc_7 pc_8 pc_9 pc_10
id
868826 M 3.729203 0.987215 3.540855 -2.064283 2.512443 1.936519 0.697969 0.871868 0.642028 -1.833888
8810703 M 12.079158 -6.698169 10.242397 -5.434204 3.701610 -1.501518 -4.413311 1.612258 1.425855 -1.835477
906878 B -0.311673 0.128320 -1.056912 0.070388 -1.547663 0.331599 0.032196 -0.533350 0.293836 0.071285
911654 B -0.474681 -0.957130 -0.280827 0.354585 -1.590079 -0.326743 -0.120392 -0.328281 -0.094953 -0.681747
903483 B -3.766843 2.522881 1.905036 -0.056397 2.901107 -1.592187 -1.428407 0.134134 -0.774598 1.244052
In [104]:
##################################
# Applying the preprocessing pipeline
# to the validation data
##################################
breast_cancer_preprocessed_validation = preprocess_dataset(breast_cancer_validation, breast_cancer_validation, 10, 987654321)
X_preprocessed_validation = breast_cancer_preprocessed_validation.drop('diagnosis', axis = 1)
y_preprocessed_validation = breast_cancer_preprocessed_validation['diagnosis']
breast_cancer_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_PATH, "breast_cancer_preprocessed_validation.csv"), index=False)
X_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_FEATURES_PATH, "X_preprocessed_validation.csv"), index=False)
y_preprocessed_validation.to_csv(os.path.join("..", DATASETS_PREPROCESSED_VALIDATION_TARGET_PATH, "y_preprocessed_validation.csv"), index=False)
print('Final Preprocessed Validation Dataset Dimensions: ')
display(X_preprocessed_validation.shape)
display(y_preprocessed_validation.shape)
print('Final Preprocessed Validation Target Variable Breakdown: ')
display(y_preprocessed_validation.value_counts())
print('Final Preprocessed Validation Target Variable Proportion: ')
display(y_preprocessed_validation.value_counts(normalize = True))
breast_cancer_preprocessed_validation.head()
Explained Variance (First 10 PCs): 0.9658
Final Preprocessed Validation Dataset Dimensions: 
(107, 10)
(107,)
Final Preprocessed Validation Target Variable Breakdown: 
diagnosis
B    67
M    40
Name: count, dtype: int64
Final Preprocessed Validation Target Variable Proportion: 
diagnosis
B    0.626168
M    0.373832
Name: proportion, dtype: float64
Out[104]:
diagnosis pc_1 pc_2 pc_3 pc_4 pc_5 pc_6 pc_7 pc_8 pc_9 pc_10
id
86355 M 13.035175 0.217957 2.105837 -0.636468 0.051561 -1.807528 -0.025319 0.404616 -0.454300 -1.499024
884948 M 7.208194 -2.366385 1.928770 0.199315 -0.748175 -1.513559 0.265762 -0.764441 0.007565 0.402246
915276 B 1.300337 8.300252 -0.043626 -1.908086 -1.499821 3.112851 -0.717757 0.515100 1.345107 -0.311807
858970 B -2.512677 3.300052 1.674471 -2.190322 3.044812 -1.144982 0.227182 -0.581862 -0.484832 1.131556
898677 B -2.418011 4.124441 2.878352 -0.155380 -0.288107 0.993082 -0.246339 1.222199 2.012470 -0.674194
In [105]:
##################################
# Applying the preprocessing pipeline
# to the test data
##################################
breast_cancer_preprocessed_test = preprocess_dataset(breast_cancer_test, breast_cancer_test, 10, 987654321)
X_preprocessed_test = breast_cancer_preprocessed_test.drop('diagnosis', axis = 1)
y_preprocessed_test = breast_cancer_preprocessed_test['diagnosis']
breast_cancer_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_PATH, "breast_cancer_preprocessed_test.csv"), index=False)
X_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_FEATURES_PATH, "X_preprocessed_test.csv"), index=False)
y_preprocessed_test.to_csv(os.path.join("..", DATASETS_PREPROCESSED_TEST_TARGET_PATH, "y_preprocessed_test.csv"), index=False)
print('Final Preprocessed Test Dataset Dimensions: ')
display(X_preprocessed_test.shape)
display(y_preprocessed_test.shape)
print('Final Preprocessed Test Target Variable Breakdown: ')
display(y_preprocessed_test.value_counts())
print('Final Preprocessed Test Target Variable Proportion: ')
display(y_preprocessed_test.value_counts(normalize = True))
breast_cancer_preprocessed_test.head()
Explained Variance (First 10 PCs): 0.9630
Final Preprocessed Test Dataset Dimensions: 
(143, 10)
(143,)
Final Preprocessed Test Target Variable Breakdown: 
diagnosis
B    90
M    53
Name: count, dtype: int64
Final Preprocessed Test Target Variable Proportion: 
diagnosis
B    0.629371
M    0.370629
Name: proportion, dtype: float64
Out[105]:
diagnosis pc_1 pc_2 pc_3 pc_4 pc_5 pc_6 pc_7 pc_8 pc_9 pc_10
id
848406 M 0.203287 -1.498700 -0.973630 0.810168 0.458344 0.704048 0.268294 0.004397 0.546047 -0.413089
858981 B -2.363761 3.025143 1.519950 0.627623 2.306716 1.541578 -0.148369 -0.031751 -0.071823 -1.159295
88350402 B -2.316578 -1.273185 -0.261651 -1.193922 -0.203169 0.076551 0.687459 -0.161819 0.152953 -0.160444
9112594 B -3.134608 -1.944446 -0.040192 2.182643 0.277373 0.231880 0.295401 -0.048081 -0.121538 0.193050
86409 B 4.139336 3.702540 2.670982 -0.154971 -5.773728 -1.251681 -1.610567 1.354328 -0.115852 -0.220181
In [106]:
##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
    # Compute the standard classification metrics and return them as a tidy DataFrame
    metric_name = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUROC']
    metric_value = [accuracy_score(y_true, y_pred),
                    precision_score(y_true, y_pred),
                    recall_score(y_true, y_pred),
                    f1_score(y_true, y_pred),
                    roc_auc_score(y_true, y_pred)]
    metric_summary = pd.DataFrame(zip(metric_name, metric_value),
                                  columns=['metric_name', 'metric_value'])
    return metric_summary
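As a quick sanity check, the helper can be exercised on a small set of hypothetical labels and predictions (the arrays below are illustrative only and are not taken from the project data):

##################################
# Illustrative sanity check of the
# model_performance_evaluation helper
# using hypothetical labels (not project data)
##################################
import numpy as np

y_true_demo = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_pred_demo = np.array([0, 0, 1, 0, 1, 0, 1, 1])
# Returns a DataFrame with one row each for Accuracy, Precision, Recall, F1 and AUROC
display(model_performance_evaluation(y_true_demo, y_pred_demo))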

1.7. Model Development and Validation ¶

1.7.1 Random Forest¶

Random Forest is an ensemble learning method that builds multiple decision trees and combines their outputs to improve prediction accuracy and robustness in binary classification. Instead of relying on a single decision tree, it aggregates multiple trees, reducing overfitting and increasing generalizability. The algorithm works by training individual decision trees on bootstrapped samples of the dataset, where each tree is trained on a slightly different subset of data. Additionally, at each decision node, a random subset of features is considered for splitting, adding further diversity among the trees. The final classification is determined by majority voting across all trees. The main advantages of Random Forest include its resilience to overfitting, ability to handle high-dimensional data, and robustness against noisy data. However, it has limitations, such as higher computational cost due to multiple trees and reduced interpretability compared to a single decision tree. It can also struggle with highly imbalanced data unless additional techniques like class weighting are applied.
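To make the bootstrap-and-vote mechanism concrete, the minimal sketch below grows a handful of decision trees on bootstrapped resamples of a synthetic toy dataset and aggregates their predictions by majority vote. It is a conceptual illustration only, separate from the tuned pipeline implemented further down, and all names such as X_demo are hypothetical.

##################################
# Conceptual sketch: bootstrap aggregation
# with majority voting (illustration only,
# not the tuned pipeline below)
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
rng = np.random.default_rng(0)
trees = []
for _ in range(25):
    # Each tree sees a bootstrapped resample of the training rows
    # and a random subset of features at every split (max_features='sqrt')
    idx = rng.integers(0, len(X_demo), size=len(X_demo))
    tree = DecisionTreeClassifier(max_features='sqrt', random_state=0)
    trees.append(tree.fit(X_demo[idx], y_demo[idx]))

# Majority vote across the individual trees
vote_fraction = np.mean([t.predict(X_demo) for t in trees], axis=0)
y_vote = (vote_fraction >= 0.5).astype(int)
print('Ensemble training accuracy:', (y_vote == y_demo).mean())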

  1. The random forest model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • criterion = function to measure the quality of a split made to vary between gini and entropy
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • max_features = number of features to consider when looking for the best split made to vary between 7 and 9
  3. A special hyperparameter (class_weight = balanced) was fixed to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with optimal model performance (based on the F1 score) obtained for:
    • criterion = entropy
    • max_depth = 5
    • min_samples_leaf = 5
    • max_features = 9
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9749
    • Precision = 0.9743
    • Recall = 0.9579
    • F1 Score = 0.9661
    • AUROC = 0.9714
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9345
    • Precision = 0.9714
    • Recall = 0.8500
    • F1 Score = 0.9066
    • AUROC = 0.9175
  7. The apparent and independent validation model performance were sufficiently comparable, suggesting the absence of excessive model overfitting.
In [107]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [108]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
bagged_rf_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('bagged_rf_model', RandomForestClassifier(
        class_weight='balanced',
        random_state=987654321))
])
In [109]:
##################################
# Defining hyperparameter grid
##################################
bagged_rf_hyperparameter_grid = {
    'bagged_rf_model__criterion': ['gini', 'entropy'],
    'bagged_rf_model__max_depth': [3, 5],
    'bagged_rf_model__min_samples_leaf': [5, 10],
    'bagged_rf_model__max_features': [7, 9]
}
In [110]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [111]:
##################################
# Performing Grid Search with cross-validation
##################################
bagged_rf_grid_search = GridSearchCV(
    estimator=bagged_rf_pipeline,
    param_grid=bagged_rf_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [112]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [113]:
##################################
# Fitting GridSearchCV
##################################
bagged_rf_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[113]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('bagged_rf_model',
                                        RandomForestClassifier(class_weight='balanced',
                                                               random_state=987654321))]),
             n_jobs=-1,
             param_grid={'bagged_rf_model__criterion': ['gini', 'entropy'],
                         'bagged_rf_model__max_depth': [3, 5],
                         'bagged_rf_model__max_features': [7, 9],
                         'bagged_rf_model__min_samples_leaf': [5, 10]},
             scoring='f1', verbose=1)
In [114]:
##################################
# Identifying the best model
##################################
bagged_rf_optimal = bagged_rf_grid_search.best_estimator_
In [115]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
bagged_rf_optimal_f1_cv = bagged_rf_grid_search.best_score_
bagged_rf_optimal_f1_train = f1_score(y_train_encoded, bagged_rf_optimal.predict(X_train))
bagged_rf_optimal_f1_validation = f1_score(y_validation_encoded, bagged_rf_optimal.predict(X_validation))
In [116]:
##################################
# Identifying the optimal model
##################################
print('Best Bagged Model - Random Forest: ')
print(f"Best Random Forest Hyperparameters: {bagged_rf_grid_search.best_params_}")
Best Bagged Model - Random Forest: 
Best Random Forest Hyperparameters: {'bagged_rf_model__criterion': 'entropy', 'bagged_rf_model__max_depth': 5, 'bagged_rf_model__max_features': 9, 'bagged_rf_model__min_samples_leaf': 5}
In [117]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {bagged_rf_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {bagged_rf_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, bagged_rf_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9121
F1 Score on Training Data: 0.9661

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       200
           1       0.97      0.96      0.97       119

    accuracy                           0.97       319
   macro avg       0.97      0.97      0.97       319
weighted avg       0.97      0.97      0.97       319

In [118]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, bagged_rf_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, bagged_rf_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Random Forest Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Random Forest Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Random Forest model on the train data]
In [119]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {bagged_rf_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, bagged_rf_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9067

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95        67
           1       0.97      0.85      0.91        40

    accuracy                           0.93       107
   macro avg       0.94      0.92      0.93       107
weighted avg       0.94      0.93      0.93       107

In [120]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, bagged_rf_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, bagged_rf_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Random Forest Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Random Forest Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Random Forest model on the validation data]
In [121]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
bagged_rf_optimal_train = model_performance_evaluation(y_train_encoded, bagged_rf_optimal.predict(X_train))
bagged_rf_optimal_train['model'] = ['bagged_rf_optimal'] * 5
bagged_rf_optimal_train['set'] = ['train'] * 5
print('Optimal Random Forest Train Performance Metrics: ')
display(bagged_rf_optimal_train)
Optimal Random Forest Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.974922 bagged_rf_optimal train
1 Precision 0.974359 bagged_rf_optimal train
2 Recall 0.957983 bagged_rf_optimal train
3 F1 0.966102 bagged_rf_optimal train
4 AUROC 0.971492 bagged_rf_optimal train
In [122]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
bagged_rf_optimal_validation = model_performance_evaluation(y_validation_encoded, bagged_rf_optimal.predict(X_validation))
bagged_rf_optimal_validation['model'] = ['bagged_rf_optimal'] * 5
bagged_rf_optimal_validation['set'] = ['validation'] * 5
print('Optimal Random Forest Validation Performance Metrics: ')
display(bagged_rf_optimal_validation)
Optimal Random Forest Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.934579 bagged_rf_optimal validation
1 Precision 0.971429 bagged_rf_optimal validation
2 Recall 0.850000 bagged_rf_optimal validation
3 F1 0.906667 bagged_rf_optimal validation
4 AUROC 0.917537 bagged_rf_optimal validation
In [123]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(bagged_rf_optimal, 
            os.path.join("..", MODELS_PATH, "bagged_model_random_forest_optimal.pkl"))
Out[123]:
['..\\models\\bagged_model_random_forest_optimal.pkl']

1.7.2 AdaBoost¶

AdaBoost (Adaptive Boosting) is a boosting technique that combines multiple weak learners — typically decision stumps (shallow trees) — to form a strong classifier. It works by iteratively training weak models, assigning higher weights to misclassified instances so that subsequent models focus on difficult cases. At each iteration, a new weak model is trained, and its predictions are combined using a weighted voting mechanism. This process continues until a stopping criterion is met, such as a predefined number of iterations or performance threshold. AdaBoost is advantageous because it improves accuracy without overfitting if regularized properly. It performs well with clean data and can transform weak classifiers into strong ones. However, it is sensitive to noisy data and outliers, as misclassified points receive higher importance, leading to potential overfitting. Additionally, training can be slow for large datasets, and performance depends on the choice of base learner, typically decision trees.
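The reweighting mechanism can be illustrated with a minimal manual loop over decision stumps on a synthetic toy dataset. This is a conceptual sketch of one common AdaBoost formulation (the plus/minus-one variant), not the tuned pipeline implemented below, and all names such as X_demo are hypothetical.

##################################
# Conceptual sketch: AdaBoost-style sample
# reweighting with decision stumps
# (illustration only, not the tuned pipeline below)
##################################
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
w = np.full(len(X_demo), 1 / len(X_demo))   # start with uniform sample weights
stumps, alphas = [], []
for _ in range(10):
    stump = DecisionTreeClassifier(max_depth=1, random_state=0).fit(X_demo, y_demo, sample_weight=w)
    miss = stump.predict(X_demo) != y_demo
    err = np.clip(w[miss].sum() / w.sum(), 1e-10, 1 - 1e-10)
    alpha = 0.5 * np.log((1 - err) / err)          # stump vote: lower error, larger weight
    w = w * np.exp(alpha * np.where(miss, 1, -1))  # up-weight misclassified samples
    w = w / w.sum()
    stumps.append(stump)
    alphas.append(alpha)

# Weighted vote of the stumps (class labels mapped to -1/+1)
scores = sum(a * np.where(s.predict(X_demo) == 1, 1, -1) for s, a in zip(stumps, alphas))
print('Ensemble training accuracy:', ((scores > 0).astype(int) == y_demo).mean())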

  1. The adaboost model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 3 hyperparameters for tuning:
    • estimator_max_depth = maximum depth of the tree made to vary between 1 and 2
    • learning_rate = weight applied to each classifier at each boosting iteration made to vary between 0.01 and 0.10
    • n_estimators = maximum number of estimators at which boosting is terminated made to vary between 50 and 100
  3. No hyperparameter was defined in the model to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with optimal model performance (based on the F1 score) obtained for:
    • estimator_max_depth = 2
    • learning_rate = 0.10
    • n_estimators = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9937
    • Precision = 1.0000
    • Recall = 0.9831
    • F1 Score = 0.9915
    • AUROC = 0.9915
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9439
    • Precision = 0.9722
    • Recall = 0.8750
    • F1 Score = 0.9210
    • AUROC = 0.9300
  7. The apparent and independent validation model performance were sufficiently comparable, suggesting the absence of excessive model overfitting.
In [124]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [125]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_ab_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('boosted_ab_model', AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=987654321),
                                            random_state=987654321))
])
In [126]:
##################################
# Defining hyperparameter grid
##################################
boosted_ab_hyperparameter_grid = {
    'boosted_ab_model__learning_rate': [0.01, 0.10],  
    'boosted_ab_model__estimator__max_depth': [1, 2],
    'boosted_ab_model__n_estimators': [50, 100]
}
In [127]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [128]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_ab_grid_search = GridSearchCV(
    estimator=boosted_ab_pipeline,
    param_grid=boosted_ab_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [129]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [130]:
##################################
# Fitting GridSearchCV
##################################
boosted_ab_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 8 candidates, totalling 200 fits
Out[130]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('boosted_ab_model',
                                        AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=987654321),
                                                           random_state=987654321))]),
             n_jobs=-1,
             param_grid={'boosted_ab_model__estimator__max_depth': [1, 2],
                         'boosted_ab_model__learning_rate': [0.01, 0.1],
                         'boosted_ab_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [131]:
##################################
# Identifying the best model
##################################
boosted_ab_optimal = boosted_ab_grid_search.best_estimator_
In [132]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_ab_optimal_f1_cv = boosted_ab_grid_search.best_score_
boosted_ab_optimal_f1_train = f1_score(y_train_encoded, boosted_ab_optimal.predict(X_train))
boosted_ab_optimal_f1_validation = f1_score(y_validation_encoded, boosted_ab_optimal.predict(X_validation))
In [133]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - AdaBoost: ')
print(f"Best AdaBoost Hyperparameters: {boosted_ab_grid_search.best_params_}")
Best Boosted Model - AdaBoost: 
Best AdaBoost Hyperparameters: {'boosted_ab_model__estimator__max_depth': 2, 'boosted_ab_model__learning_rate': 0.1, 'boosted_ab_model__n_estimators': 100}
In [134]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_ab_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_ab_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, boosted_ab_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9280
F1 Score on Training Data: 0.9915

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       200
           1       1.00      0.98      0.99       119

    accuracy                           0.99       319
   macro avg       1.00      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319

In [135]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, boosted_ab_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, boosted_ab_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal AdaBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal AdaBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal AdaBoost model on the train data]
In [136]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_ab_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, boosted_ab_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9211

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96        67
           1       0.97      0.88      0.92        40

    accuracy                           0.94       107
   macro avg       0.95      0.93      0.94       107
weighted avg       0.95      0.94      0.94       107

In [137]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, boosted_ab_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, boosted_ab_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal AdaBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal AdaBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal AdaBoost model on the validation data]
In [138]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_ab_optimal_train = model_performance_evaluation(y_train_encoded, boosted_ab_optimal.predict(X_train))
boosted_ab_optimal_train['model'] = ['boosted_ab_optimal'] * 5
boosted_ab_optimal_train['set'] = ['train'] * 5
print('Optimal AdaBoost Train Performance Metrics: ')
display(boosted_ab_optimal_train)
Optimal AdaBoost Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.993730 boosted_ab_optimal train
1 Precision 1.000000 boosted_ab_optimal train
2 Recall 0.983193 boosted_ab_optimal train
3 F1 0.991525 boosted_ab_optimal train
4 AUROC 0.991597 boosted_ab_optimal train
In [139]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_ab_optimal_validation = model_performance_evaluation(y_validation_encoded, boosted_ab_optimal.predict(X_validation))
boosted_ab_optimal_validation['model'] = ['boosted_ab_optimal'] * 5
boosted_ab_optimal_validation['set'] = ['validation'] * 5
print('Optimal AdaBoost Validation Performance Metrics: ')
display(boosted_ab_optimal_validation)
Optimal AdaBoost Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.943925 boosted_ab_optimal validation
1 Precision 0.972222 boosted_ab_optimal validation
2 Recall 0.875000 boosted_ab_optimal validation
3 F1 0.921053 boosted_ab_optimal validation
4 AUROC 0.930037 boosted_ab_optimal validation
In [140]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_ab_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_adaboost_optimal.pkl"))
Out[140]:
['..\\models\\boosted_model_adaboost_optimal.pkl']

1.7.3 Gradient Boosting¶

Gradient Boosting builds an ensemble of decision trees sequentially, where each new tree corrects the mistakes of the previous ones by optimizing a loss function. Unlike AdaBoost, which reweights misclassified instances, Gradient Boosting fits each new tree to the residual errors of the previous model, gradually improving predictions. This process continues until a stopping criterion, such as a set number of trees, is met. The key advantages of Gradient Boosting include its flexibility to model complex relationships and strong predictive performance, often outperforming bagging methods. It can handle both numeric and categorical data well. However, it is prone to overfitting if not carefully tuned, especially with deep trees and too many iterations. It is also computationally expensive due to sequential training, and hyperparameter tuning (e.g., learning rate, number of trees, tree depth) can be challenging and time-consuming.
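The residual-fitting idea can be illustrated with a minimal squared-error toy example, in which each new tree is fit to the residuals of the running prediction and its contribution is shrunk by the learning rate. This is a conceptual sketch on synthetic data, not the log-loss classifier tuned below, and all names such as X_demo are hypothetical.

##################################
# Conceptual sketch: gradient boosting as
# sequential fitting of residuals
# (squared-error toy example, illustration only)
##################################
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X_demo = rng.uniform(-3, 3, size=(300, 1))
y_demo = np.sin(X_demo[:, 0]) + rng.normal(scale=0.1, size=300)

learning_rate = 0.1
prediction = np.full_like(y_demo, y_demo.mean())        # start from the mean prediction
for _ in range(100):
    residuals = y_demo - prediction                     # negative gradient of squared error
    tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_demo, residuals)
    prediction += learning_rate * tree.predict(X_demo)  # shrink each tree's contribution

print('Training MSE after boosting:', np.mean((y_demo - prediction) ** 2))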

  1. The gradient boosting model from the sklearn.ensemble Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = shrinkage factor applied to the contribution of each tree made to vary between 0.01 and 0.10
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • min_samples_leaf = minimum number of samples required to be at a leaf node made to vary between 5 and 10
    • n_estimators = number of boosting stages to perform made to vary between 50 and 100
  3. No hyperparameter was defined in the model to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with optimal model performance (based on the F1 score) obtained for:
    • learning_rate = 0.10
    • max_depth = 3
    • min_samples_leaf = 10
    • n_estimators = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 1.0000
    • Precision = 1.0000
    • Recall = 1.0000
    • F1 Score = 1.0000
    • AUROC = 1.0000
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9345
    • Precision = 0.9714
    • Recall = 0.8500
    • F1 Score = 0.9066
    • AUROC = 0.9175
  7. The apparent and independent validation model performance were sufficiently comparable, suggesting the absence of excessive model overfitting.
In [141]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [142]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_gb_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('boosted_gb_model', GradientBoostingClassifier(n_iter_no_change=10,
                                                    validation_fraction=0.1,
                                                    tol=1e-4,
                                                    random_state=987654321))
])
In [143]:
##################################
# Defining hyperparameter grid
##################################
boosted_gb_hyperparameter_grid = {
    'boosted_gb_model__learning_rate': [0.01, 0.10],
    'boosted_gb_model__max_depth': [3, 6], 
    'boosted_gb_model__min_samples_leaf': [5, 10],
    'boosted_gb_model__n_estimators': [50, 100] 
}
In [144]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [145]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_gb_grid_search = GridSearchCV(
    estimator=boosted_gb_pipeline,
    param_grid=boosted_gb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [146]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [147]:
##################################
# Fitting GridSearchCV
##################################
boosted_gb_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[147]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('boosted_gb_model',
                                        GradientBoostingClassifier(n_iter_no_change=10,
                                                                   random_state=987654321))]),
             n_jobs=-1,
             param_grid={'boosted_gb_model__learning_rate': [0.01, 0.1],
                         'boosted_gb_model__max_depth': [3, 6],
                         'boosted_gb_model__min_samples_leaf': [5, 10],
                         'boosted_gb_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [148]:
##################################
# Identifying the best model
##################################
boosted_gb_optimal = boosted_gb_grid_search.best_estimator_
In [149]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_gb_optimal_f1_cv = boosted_gb_grid_search.best_score_
boosted_gb_optimal_f1_train = f1_score(y_train_encoded, boosted_gb_optimal.predict(X_train))
boosted_gb_optimal_f1_validation = f1_score(y_validation_encoded, boosted_gb_optimal.predict(X_validation))
In [150]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - Gradient Boosting: ')
print(f"Best Gradient Boosting Hyperparameters: {boosted_gb_grid_search.best_params_}")
Best Boosted Model - Gradient Boosting: 
Best Gradient Boosting Hyperparameters: {'boosted_gb_model__learning_rate': 0.1, 'boosted_gb_model__max_depth': 3, 'boosted_gb_model__min_samples_leaf': 10, 'boosted_gb_model__n_estimators': 100}
In [151]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_gb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_gb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, boosted_gb_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9330
F1 Score on Training Data: 1.0000

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       119

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319

In [152]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, boosted_gb_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, boosted_gb_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Gradient Boosting Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Gradient Boosting Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Gradient Boosting model on the train data]
In [153]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_gb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, boosted_gb_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9067

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95        67
           1       0.97      0.85      0.91        40

    accuracy                           0.93       107
   macro avg       0.94      0.92      0.93       107
weighted avg       0.94      0.93      0.93       107

In [154]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, boosted_gb_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, boosted_gb_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Gradient Boosting Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Gradient Boosting Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Gradient Boosting model on the validation data]
In [155]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_gb_optimal_train = model_performance_evaluation(y_train_encoded, boosted_gb_optimal.predict(X_train))
boosted_gb_optimal_train['model'] = ['boosted_gb_optimal'] * 5
boosted_gb_optimal_train['set'] = ['train'] * 5
print('Optimal Gradient Boosting Train Performance Metrics: ')
display(boosted_gb_optimal_train)
Optimal Gradient Boosting Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 1.0 boosted_gb_optimal train
1 Precision 1.0 boosted_gb_optimal train
2 Recall 1.0 boosted_gb_optimal train
3 F1 1.0 boosted_gb_optimal train
4 AUROC 1.0 boosted_gb_optimal train
In [156]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_gb_optimal_validation = model_performance_evaluation(y_validation_encoded, boosted_gb_optimal.predict(X_validation))
boosted_gb_optimal_validation['model'] = ['boosted_gb_optimal'] * 5
boosted_gb_optimal_validation['set'] = ['validation'] * 5
print('Optimal Gradient Boosting Validation Performance Metrics: ')
display(boosted_gb_optimal_validation)
Optimal Gradient Boosting Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.934579 boosted_gb_optimal validation
1 Precision 0.971429 boosted_gb_optimal validation
2 Recall 0.850000 boosted_gb_optimal validation
3 F1 0.906667 boosted_gb_optimal validation
4 AUROC 0.917537 boosted_gb_optimal validation
In [157]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_gb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_gradient_boosting_optimal.pkl"))
Out[157]:
['..\\models\\boosted_model_gradient_boosting_optimal.pkl']

1.7.4 XGBoost¶

XGBoost (Extreme Gradient Boosting) is an optimized version of Gradient Boosting that introduces additional regularization and computational efficiencies. It builds decision trees sequentially, with each new tree correcting the residual errors of the previous ones, but it incorporates advanced techniques such as shrinkage (learning rate), column subsampling, and L1/L2 regularization to prevent overfitting. Additionally, XGBoost employs parallelization, reducing training time significantly compared to standard Gradient Boosting. It is widely used in machine learning competitions due to its superior accuracy and efficiency. The key advantages include its ability to handle missing data, built-in regularization for better generalization, and fast training through parallelization. However, XGBoost requires careful hyperparameter tuning to achieve optimal performance, and the model can become overly complex, making interpretation difficult. It is also memory-intensive, especially for large datasets, and can be challenging to deploy efficiently in real-time applications.

  1. The xgboost model from the xgboost Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • max_depth = maximum depth of the tree made to vary between 3 and 6
    • gamma = minimum loss reduction required to make a further split in a tree made to vary between 0.10 and 0.20
    • n_estimators = number of boosting stages to perform made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 1.7) was fixed to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories (a quick check of this value is sketched after this list).
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method, with optimal model performance (based on the F1 score) obtained for:
    • learning_rate = 0.10
    • max_depth = 6
    • gamma = 0.20
    • n_estimators = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 1.0000
    • Precision = 1.0000
    • Recall = 1.0000
    • F1 Score = 1.0000
    • AUROC = 1.0000
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9439
    • Precision = 0.9722
    • Recall = 0.8750
    • F1 Score = 0.9210
    • AUROC = 0.9300
  7. The apparent and independent validation model performance were sufficiently comparable, suggesting the absence of excessive model overfitting.
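As referenced in item 3 above, the fixed scale_pos_weight follows the common negatives-to-positives heuristic applied to the train class counts reported earlier (200 B versus 119 M); the short check below is illustrative only.

##################################
# Sanity check for the fixed scale_pos_weight
# derived from the train class counts
# (200 benign vs 119 malignant)
##################################
n_negative, n_positive = 200, 119          # B and M counts in the train split
print(round(n_negative / n_positive, 2))   # ~1.68, rounded to the 1.7 used in the pipeline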
In [158]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [159]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_xgb_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('boosted_xgb_model', XGBClassifier(scale_pos_weight=1.7, 
                                        random_state=987654321,
                                        subsample=0.7,
                                        colsample_bytree=0.7,
                                        eval_metric='logloss'))
])
In [160]:
##################################
# Defining hyperparameter grid
##################################
boosted_xgb_hyperparameter_grid = {
    'boosted_xgb_model__learning_rate': [0.01, 0.10],
    'boosted_xgb_model__max_depth': [3, 6], 
    'boosted_xgb_model__gamma': [0.1, 0.2],
    'boosted_xgb_model__n_estimators': [50, 100]
}
In [161]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [162]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_xgb_grid_search = GridSearchCV(
    estimator=boosted_xgb_pipeline,
    param_grid=boosted_xgb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [163]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [164]:
##################################
# Fitting GridSearchCV
##################################
boosted_xgb_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[164]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('boosted_xgb_model',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      c...
                                                      missing=nan,
                                                      monotone_constraints=None,
                                                      multi_strategy=None,
                                                      n_estimators=None,
                                                      n_jobs=None,
                                                      num_parallel_tree=None,
                                                      random_state=987654321, ...))]),
             n_jobs=-1,
             param_grid={'boosted_xgb_model__gamma': [0.1, 0.2],
                         'boosted_xgb_model__learning_rate': [0.01, 0.1],
                         'boosted_xgb_model__max_depth': [3, 6],
                         'boosted_xgb_model__n_estimators': [50, 100]},
             scoring='f1', verbose=1)
In [165]:
##################################
# Identifying the best model
##################################
boosted_xgb_optimal = boosted_xgb_grid_search.best_estimator_
In [166]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_xgb_optimal_f1_cv = boosted_xgb_grid_search.best_score_
boosted_xgb_optimal_f1_train = f1_score(y_train_encoded, boosted_xgb_optimal.predict(X_train))
boosted_xgb_optimal_f1_validation = f1_score(y_validation_encoded, boosted_xgb_optimal.predict(X_validation))
In [167]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - XGBoost: ')
print(f"Best XGBoost Hyperparameters: {boosted_xgb_grid_search.best_params_}")
Best Boosted Model - XGBoost: 
Best XGBoost Hyperparameters: {'boosted_xgb_model__gamma': 0.2, 'boosted_xgb_model__learning_rate': 0.1, 'boosted_xgb_model__max_depth': 6, 'boosted_xgb_model__n_estimators': 100}
In [168]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_xgb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_xgb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, boosted_xgb_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9461
F1 Score on Training Data: 1.0000

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       119

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319

In [169]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, boosted_xgb_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, boosted_xgb_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal XGBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal XGBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal XGBoost model on the train data]
In [170]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_xgb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, boosted_xgb_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9211

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96        67
           1       0.97      0.88      0.92        40

    accuracy                           0.94       107
   macro avg       0.95      0.93      0.94       107
weighted avg       0.95      0.94      0.94       107

In [171]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, boosted_xgb_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, boosted_xgb_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal XGBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal XGBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal XGBoost model on the validation data]
In [172]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_xgb_optimal_train = model_performance_evaluation(y_train_encoded, boosted_xgb_optimal.predict(X_train))
boosted_xgb_optimal_train['model'] = ['boosted_xgb_optimal'] * 5
boosted_xgb_optimal_train['set'] = ['train'] * 5
print('Optimal XGBoost Train Performance Metrics: ')
display(boosted_xgb_optimal_train)
Optimal XGBoost Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 1.0 boosted_xgb_optimal train
1 Precision 1.0 boosted_xgb_optimal train
2 Recall 1.0 boosted_xgb_optimal train
3 F1 1.0 boosted_xgb_optimal train
4 AUROC 1.0 boosted_xgb_optimal train
In [173]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_xgb_optimal_validation = model_performance_evaluation(y_validation_encoded, boosted_xgb_optimal.predict(X_validation))
boosted_xgb_optimal_validation['model'] = ['boosted_xgb_optimal'] * 5
boosted_xgb_optimal_validation['set'] = ['validation'] * 5
print('Optimal XGBoost Validation Performance Metrics: ')
display(boosted_xgb_optimal_validation)
Optimal XGBoost Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.943925 boosted_xgb_optimal validation
1 Precision 0.972222 boosted_xgb_optimal validation
2 Recall 0.875000 boosted_xgb_optimal validation
3 F1 0.921053 boosted_xgb_optimal validation
4 AUROC 0.930037 boosted_xgb_optimal validation
In [174]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_xgb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_xgboost_optimal.pkl"))
Out[174]:
['..\\models\\boosted_model_xgboost_optimal.pkl']

1.7.5 Light GBM¶

Light GBM (Light Gradient Boosting Machine) is a variation of Gradient Boosting designed for efficiency and scalability. Unlike traditional boosting methods that grow trees level by level, LightGBM grows trees leaf-wise, choosing the most informative splits, leading to faster convergence. It also uses histogram-based binning to speed up computations. These optimizations allow LightGBM to train on large datasets efficiently while maintaining high accuracy. Its advantages include faster training speed, reduced memory usage, and strong predictive performance, particularly for large datasets with many features. However, LightGBM can overfit more easily than XGBoost if not properly tuned, and it may not perform as well on small datasets. Additionally, its handling of categorical variables requires careful preprocessing, and the leaf-wise tree growth can sometimes lead to instability if not controlled properly.
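
As a small illustrative sketch, separate from the tuned pipeline below, the following snippet contrasts an uncapped leaf-wise learner with one whose growth is explicitly bounded; the values are arbitrary assumptions chosen only to show that num_leaves is conventionally kept at or below 2**max_depth when a depth cap is applied.

##################################
# Illustrative sketch (arbitrary values, separate from the tuned
# pipeline below): bounding LightGBM's leaf-wise tree growth
##################################
from lightgbm import LGBMClassifier

max_depth = 4
num_leaves = min(16, 2 ** max_depth)  # a depth-d tree can hold at most 2**d leaves

leafwise_uncapped = LGBMClassifier(num_leaves=31, max_depth=-1, verbose=-1)               # purely leaf-wise growth
leafwise_capped = LGBMClassifier(num_leaves=num_leaves, max_depth=max_depth, verbose=-1)  # growth bounded for stability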

  1. The light gbm model from the lightgbm Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • min_child_samples = minimum number of data samples needed in a child node made to vary between 3 and 6
    • num_leaves = maximum tree leaves for base learners made to vary between 8 and 16
    • n_estimators = number of boosted trees to fit made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 1.7) was fixed to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.10
    • min_child_samples = 6
    • num_leaves = 16
    • n_estimators = 50
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 1.0000
    • Precision = 1.0000
    • Recall = 1.0000
    • F1 Score = 1.0000
    • AUROC = 1.0000
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9532
    • Precision = 0.9729
    • Recall = 0.9000
    • F1 Score = 0.9350
    • AUROC = 0.9425
  7. Sufficiently comparable apparent and independent validation model performance was observed, which might be indicative of the absence of excessive model overfitting.
In [175]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [176]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_lgbm_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('boosted_lgbm_model', LGBMClassifier(scale_pos_weight=1.7, 
                                          random_state=987654321,
                                          max_depth=-1,
                                          feature_fraction =0.7,
                                          bagging_fraction=0.7,
                                          verbose=-1))
])
In [177]:
##################################
# Defining hyperparameter grid
##################################
boosted_lgbm_hyperparameter_grid = {
    'boosted_lgbm_model__learning_rate': [0.01, 0.10],
    'boosted_lgbm_model__min_child_samples': [3, 6], 
    'boosted_lgbm_model__num_leaves': [8, 16],
    'boosted_lgbm_model__n_estimators': [50, 100] 
}
In [178]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [179]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_lgbm_grid_search = GridSearchCV(
    estimator=boosted_lgbm_pipeline,
    param_grid=boosted_lgbm_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [180]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [181]:
##################################
# Fitting GridSearchCV
##################################
boosted_lgbm_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[181]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('boosted_lgbm_model',
                                        LGBMClassifier(bagging_fraction=0.7,
                                                       feature_fraction=0.7,
                                                       random_state=987654321,
                                                       scale_pos_weight=1.7,
                                                       verbose=-1))]),
             n_jobs=-1,
             param_grid={'boosted_lgbm_model__learning_rate': [0.01, 0.1],
                         'boosted_lgbm_model__min_child_samples': [3, 6],
                         'boosted_lgbm_model__n_estimators': [50, 100],
                         'boosted_lgbm_model__num_leaves': [8, 16]},
             scoring='f1', verbose=1)
In [182]:
##################################
# Identifying the best model
##################################
boosted_lgbm_optimal = boosted_lgbm_grid_search.best_estimator_
In [183]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_lgbm_optimal_f1_cv = boosted_lgbm_grid_search.best_score_
boosted_lgbm_optimal_f1_train = f1_score(y_train_encoded, boosted_lgbm_optimal.predict(X_train))
boosted_lgbm_optimal_f1_validation = f1_score(y_validation_encoded, boosted_lgbm_optimal.predict(X_validation))
In [184]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - Light GBM: ')
print(f"Best Light GBM Hyperparameters: {boosted_lgbm_grid_search.best_params_}")
Best Boosted Model - Light GBM: 
Best Light GBM Hyperparameters: {'boosted_lgbm_model__learning_rate': 0.1, 'boosted_lgbm_model__min_child_samples': 6, 'boosted_lgbm_model__n_estimators': 50, 'boosted_lgbm_model__num_leaves': 16}
In [185]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_lgbm_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_lgbm_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, boosted_lgbm_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9385
F1 Score on Training Data: 1.0000

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       119

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319

In [186]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, boosted_lgbm_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, boosted_lgbm_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Light GBM Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Light GBM Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Light GBM model on the train data]
In [187]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_lgbm_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, boosted_lgbm_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9351

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96        67
           1       0.97      0.90      0.94        40

    accuracy                           0.95       107
   macro avg       0.96      0.94      0.95       107
weighted avg       0.95      0.95      0.95       107

In [188]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, boosted_lgbm_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, boosted_lgbm_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal Light GBM Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal Light GBM Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal Light GBM model on the validation data]
In [189]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_lgbm_optimal_train = model_performance_evaluation(y_train_encoded, boosted_lgbm_optimal.predict(X_train))
boosted_lgbm_optimal_train['model'] = ['boosted_lgbm_optimal'] * 5
boosted_lgbm_optimal_train['set'] = ['train'] * 5
print('Optimal Light GBM Train Performance Metrics: ')
display(boosted_lgbm_optimal_train)
Optimal Light GBM Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 1.0 boosted_lgbm_optimal train
1 Precision 1.0 boosted_lgbm_optimal train
2 Recall 1.0 boosted_lgbm_optimal train
3 F1 1.0 boosted_lgbm_optimal train
4 AUROC 1.0 boosted_lgbm_optimal train
In [190]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_lgbm_optimal_validation = model_performance_evaluation(y_validation_encoded, boosted_lgbm_optimal.predict(X_validation))
boosted_lgbm_optimal_validation['model'] = ['boosted_lgbm_optimal'] * 5
boosted_lgbm_optimal_validation['set'] = ['validation'] * 5
print('Optimal Light GBM Validation Performance Metrics: ')
display(boosted_lgbm_optimal_validation)
Optimal Light GBM Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.953271 boosted_lgbm_optimal validation
1 Precision 0.972973 boosted_lgbm_optimal validation
2 Recall 0.900000 boosted_lgbm_optimal validation
3 F1 0.935065 boosted_lgbm_optimal validation
4 AUROC 0.942537 boosted_lgbm_optimal validation
In [191]:
##################################
# Saving the best individual model
# developed from the train data
##################################
joblib.dump(boosted_lgbm_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_light_gbm_optimal.pkl"))
Out[191]:
['..\\models\\boosted_model_light_gbm_optimal.pkl']

1.7.6 CatBoost¶

CatBoost (Categorical Boosting) is a boosting algorithm optimized for categorical data. Unlike other gradient boosting methods that require categorical variables to be manually encoded, CatBoost handles them natively, reducing preprocessing effort and improving performance. It builds decision trees iteratively, like other boosting methods, but uses ordered boosting to prevent target leakage and enhance generalization. The main advantages of CatBoost are its ability to handle categorical data without extensive preprocessing, high accuracy with minimal tuning, and robustness against overfitting due to built-in regularization. Additionally, it is relatively fast and memory-efficient. However, CatBoost can still be slower than LightGBM on very large datasets, and while it requires less tuning, improper parameter selection can lead to suboptimal performance. Its internal mechanics, such as ordered boosting, make interpretation more complex compared to simpler models.
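
To make the native categorical handling described above concrete, the hypothetical sketch below passes a raw string-valued column directly to CatBoost through the cat_features argument; the toy column names and values are assumptions, and this is separate from the project's PCA-based pipeline, which operates on numeric features only.

##################################
# Hypothetical sketch (separate from the project's pipeline):
# CatBoost encoding a raw categorical column internally via cat_features
##################################
import pandas as pd
from catboost import CatBoostClassifier

toy = pd.DataFrame({
    'tumor_site': ['left', 'right', 'left', 'right', 'left', 'right'],  # hypothetical categorical feature
    'radius_mean': [11.8, 20.3, 12.1, 22.5, 13.0, 19.4],                # numeric feature
    'diagnosis': [0, 1, 0, 1, 0, 1]                                     # encoded target (B=0, M=1)
})
toy_model = CatBoostClassifier(iterations=50, verbose=0, allow_writing_files=False)
toy_model.fit(toy[['tumor_site', 'radius_mean']], toy['diagnosis'], cat_features=['tumor_site'])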

  1. The catboost model from the catboost Python library API was implemented.
  2. The model contains 4 hyperparameters for tuning:
    • learning_rate = step size at which weights are updated during training made to vary between 0.01 and 0.10
    • max_depth = maximum depth of each decision tree in the boosting process made to vary between 3 and 6
    • num_leaves = maximum tree leaves for base learners made to vary between 8 and 16
    • iterations = number of boosted trees to fit made to vary between 50 and 100
  3. A special hyperparameter (scale_pos_weight = 2.0) was fixed to address the minimal 1.7:1 class imbalance observed between the B and M diagnosis categories.
  4. Hyperparameter tuning was conducted using the 5-cycle 5-fold cross-validation method with optimal model performance using the F1 score determined for:
    • learning_rate = 0.1
    • max_depth = 6
    • num_leaves = 8
    • iterations = 100
  5. The apparent model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9968
    • Precision = 0.9916
    • Recall = 1.0000
    • F1 Score = 0.9958
    • AUROC = 0.9975
  6. The independent validation model performance of the optimal model is summarized as follows:
    • Accuracy = 0.9626
    • Precision = 0.9736
    • Recall = 0.9250
    • F1 Score = 0.9487
    • AUROC = 0.9550
  7. Sufficiently comparable apparent and independent validation model performance was observed, which might be indicative of the absence of excessive model overfitting.
In [192]:
##################################
# Defining the missing value imputation, scaling and PCA preprocessing parameters
##################################
scaling_pca_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),           
    ('pca', PCA(n_components=10, random_state=987654321))  
])
In [193]:
##################################
# Defining the preprocessing and modeling pipeline parameters
##################################
boosted_cb_pipeline = Pipeline([
    ('scaling_pca_preprocessor', scaling_pca_preprocessor),
    ('boosted_cb_model', CatBoostClassifier(scale_pos_weight=2.0, 
                                            random_state=987654321,
                                            subsample =0.7,
                                            colsample_bylevel=0.7,
                                            grow_policy='Lossguide',
                                            verbose=0,
                                            allow_writing_files=False))
])
In [194]:
##################################
# Defining hyperparameter grid
##################################
boosted_cb_hyperparameter_grid = {
    'boosted_cb_model__learning_rate': [0.01, 0.10],
    'boosted_cb_model__max_depth': [3, 6], 
    'boosted_cb_model__num_leaves': [8, 16],
    'boosted_cb_model__iterations': [50, 100]
}
In [195]:
##################################
# Defining the cross-validation strategy (5-cycle 5-fold CV)
##################################
cv_strategy = RepeatedStratifiedKFold(n_splits=5, 
                                      n_repeats=5, 
                                      random_state=987654321)
In [196]:
##################################
# Performing Grid Search with cross-validation
##################################
boosted_cb_grid_search = GridSearchCV(
    estimator=boosted_cb_pipeline,
    param_grid=boosted_cb_hyperparameter_grid,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1
)
In [197]:
##################################
# Encoding the response variables
# for model training and validation
##################################
y_train_encoded = y_train.map({'B': 0, 'M': 1})
y_validation_encoded = y_validation.map({'B': 0, 'M': 1})
In [198]:
##################################
# Fitting GridSearchCV
##################################
boosted_cb_grid_search.fit(X_train, y_train_encoded)
Fitting 25 folds for each of 16 candidates, totalling 400 fits
Out[198]:
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=987654321),
             estimator=Pipeline(steps=[('scaling_pca_preprocessor',
                                        Pipeline(steps=[('imputer',
                                                         SimpleImputer(strategy='median')),
                                                        ('scaler',
                                                         StandardScaler()),
                                                        ('pca',
                                                         PCA(n_components=10,
                                                             random_state=987654321))])),
                                       ('boosted_cb_model',
                                        <catboost.core.CatBoostClassifier object at 0x000001B7FFB9CCE0>)]),
             n_jobs=-1,
             param_grid={'boosted_cb_model__iterations': [50, 100],
                         'boosted_cb_model__learning_rate': [0.01, 0.1],
                         'boosted_cb_model__max_depth': [3, 6],
                         'boosted_cb_model__num_leaves': [8, 16]},
             scoring='f1', verbose=1)
In [199]:
##################################
# Identifying the best model
##################################
boosted_cb_optimal = boosted_cb_grid_search.best_estimator_
In [200]:
##################################
# Evaluating the F1 scores
# on the training, cross-validation, and validation data
##################################
boosted_cb_optimal_f1_cv = boosted_cb_grid_search.best_score_
boosted_cb_optimal_f1_train = f1_score(y_train_encoded, boosted_cb_optimal.predict(X_train))
boosted_cb_optimal_f1_validation = f1_score(y_validation_encoded, boosted_cb_optimal.predict(X_validation))
In [201]:
##################################
# Identifying the optimal model
##################################
print('Best Boosted Model - CatBoost: ')
print(f"Best CatBoost Hyperparameters: {boosted_cb_grid_search.best_params_}")
Best Boosted Model - CatBoost: 
Best CatBoost Hyperparameters: {'boosted_cb_model__iterations': 100, 'boosted_cb_model__learning_rate': 0.1, 'boosted_cb_model__max_depth': 6, 'boosted_cb_model__num_leaves': 8}
In [202]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the training and cross-validated data
# to assess overfitting optimism
##################################
print(f"F1 Score on Cross-Validated Data: {boosted_cb_optimal_f1_cv:.4f}")
print(f"F1 Score on Training Data: {boosted_cb_optimal_f1_train:.4f}")
print("\nClassification Report on Train Data:\n", classification_report(y_train_encoded, boosted_cb_optimal.predict(X_train)))
F1 Score on Cross-Validated Data: 0.9295
F1 Score on Training Data: 0.9958

Classification Report on Train Data:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       200
           1       0.99      1.00      1.00       119

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319

In [203]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the train data
##################################
cm_raw = confusion_matrix(y_train_encoded, boosted_cb_optimal.predict(X_train))
cm_normalized = confusion_matrix(y_train_encoded, boosted_cb_optimal.predict(X_train), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal CatBoost Train Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal CatBoost Train Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal CatBoost model on the train data]
In [204]:
##################################
# Summarizing the F1 score results
# and classification metrics
# on the validation data
# to assess overfitting optimism
##################################
print(f"F1 Score on Validation Data: {boosted_cb_optimal_f1_validation:.4f}")
print("\nClassification Report on Validation Data:\n", classification_report(y_validation_encoded, boosted_cb_optimal.predict(X_validation)))
F1 Score on Validation Data: 0.9487

Classification Report on Validation Data:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        67
           1       0.97      0.93      0.95        40

    accuracy                           0.96       107
   macro avg       0.97      0.96      0.96       107
weighted avg       0.96      0.96      0.96       107

In [205]:
##################################
# Formulating the raw and normalized
# confusion matrices
# from the validation data
##################################
cm_raw = confusion_matrix(y_validation_encoded, boosted_cb_optimal.predict(X_validation))
cm_normalized = confusion_matrix(y_validation_encoded, boosted_cb_optimal.predict(X_validation), normalize='true')
fig, ax = plt.subplots(1, 2, figsize=(17, 8))
sns.heatmap(cm_raw, annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title('Raw Confusion Matrix: Optimal CatBoost Validation Performance', fontsize=11)
ax[0].set_xlabel('Predicted')
ax[0].set_ylabel('Actual')
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax[1])
ax[1].set_title('Normalized Confusion Matrix: Optimal CatBoost Validation Performance', fontsize=11)
ax[1].set_xlabel('Predicted')
ax[1].set_ylabel('Actual')
plt.tight_layout()
plt.show()
[Figure: Raw and normalized confusion matrices for the optimal CatBoost model on the validation data]
In [206]:
##################################
# Gathering the model evaluation metrics
# for the train data
##################################
boosted_cb_optimal_train = model_performance_evaluation(y_train_encoded, boosted_cb_optimal.predict(X_train))
boosted_cb_optimal_train['model'] = ['boosted_cb_optimal'] * 5
boosted_cb_optimal_train['set'] = ['train'] * 5
print('Optimal CatBoost Train Performance Metrics: ')
display(boosted_cb_optimal_train)
Optimal CatBoost Train Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.996865 boosted_cb_optimal train
1 Precision 0.991667 boosted_cb_optimal train
2 Recall 1.000000 boosted_cb_optimal train
3 F1 0.995816 boosted_cb_optimal train
4 AUROC 0.997500 boosted_cb_optimal train
In [207]:
##################################
# Gathering the model evaluation metrics
# for the validation data
##################################
boosted_cb_optimal_validation = model_performance_evaluation(y_validation_encoded, boosted_cb_optimal.predict(X_validation))
boosted_cb_optimal_validation['model'] = ['boosted_cb_optimal'] * 5
boosted_cb_optimal_validation['set'] = ['validation'] * 5
print('Optimal CatBoost Validation Performance Metrics: ')
display(boosted_cb_optimal_validation)
Optimal CatBoost Validation Performance Metrics: 
metric_name metric_value model set
0 Accuracy 0.962617 boosted_cb_optimal validation
1 Precision 0.973684 boosted_cb_optimal validation
2 Recall 0.925000 boosted_cb_optimal validation
3 F1 0.948718 boosted_cb_optimal validation
4 AUROC 0.955037 boosted_cb_optimal validation
In [208]:
##################################
# Saving the best individual model
# developed from the train data
################################## 
joblib.dump(boosted_cb_optimal, 
            os.path.join("..", MODELS_PATH, "boosted_model_catboost_optimal.pkl"))
Out[208]:
['..\\models\\boosted_model_catboost_optimal.pkl']

1.8. Model Selection ¶

  1. Among 6 candidate models, the Categorical Boosting Model was selected as the final model by demonstrating the best F1 Score for the independent validation data with minimal overfitting:
    • Apparent F1 Score Performance = 0.9958
    • Independent Validation F1 Score Performance = 0.9487
  2. The final model similarly demonstrated consistently high F1 Score for the test data:
    • Independent Test F1 Score Performance = 0.9549
  3. The final model configuration is described as follows:
    • catboost with optimal hyperparameters:
      • learning_rate = 0.1
      • max_depth = 6
      • num_leaves = 8
      • iterations = 100
In [209]:
##################################
# Consolidating all the
# bagged, boosted, stacked and blended
# model performance measures
# for the train and validation data
##################################
ensemble_train_validation_all_performance = pd.concat([bagged_rf_optimal_train,
                                             bagged_rf_optimal_validation,                                            
                                             boosted_ab_optimal_train,
                                             boosted_ab_optimal_validation,
                                             boosted_gb_optimal_train,
                                             boosted_gb_optimal_validation,
                                             boosted_xgb_optimal_train,
                                             boosted_xgb_optimal_validation,
                                             boosted_lgbm_optimal_train,
                                             boosted_lgbm_optimal_validation,
                                             boosted_cb_optimal_train,
                                             boosted_cb_optimal_validation], 
                                            ignore_index=True)
print('Consolidated Ensemble Model Performance on Train and Validation Data: ')
display(ensemble_train_validation_all_performance)
Consolidated Ensemble Model Performance on Train and Validation Data: 
metric_name metric_value model set
0 Accuracy 0.974922 bagged_rf_optimal train
1 Precision 0.974359 bagged_rf_optimal train
2 Recall 0.957983 bagged_rf_optimal train
3 F1 0.966102 bagged_rf_optimal train
4 AUROC 0.971492 bagged_rf_optimal train
5 Accuracy 0.934579 bagged_rf_optimal validation
6 Precision 0.971429 bagged_rf_optimal validation
7 Recall 0.850000 bagged_rf_optimal validation
8 F1 0.906667 bagged_rf_optimal validation
9 AUROC 0.917537 bagged_rf_optimal validation
10 Accuracy 0.993730 boosted_ab_optimal train
11 Precision 1.000000 boosted_ab_optimal train
12 Recall 0.983193 boosted_ab_optimal train
13 F1 0.991525 boosted_ab_optimal train
14 AUROC 0.991597 boosted_ab_optimal train
15 Accuracy 0.943925 boosted_ab_optimal validation
16 Precision 0.972222 boosted_ab_optimal validation
17 Recall 0.875000 boosted_ab_optimal validation
18 F1 0.921053 boosted_ab_optimal validation
19 AUROC 0.930037 boosted_ab_optimal validation
20 Accuracy 1.000000 boosted_gb_optimal train
21 Precision 1.000000 boosted_gb_optimal train
22 Recall 1.000000 boosted_gb_optimal train
23 F1 1.000000 boosted_gb_optimal train
24 AUROC 1.000000 boosted_gb_optimal train
25 Accuracy 0.934579 boosted_gb_optimal validation
26 Precision 0.971429 boosted_gb_optimal validation
27 Recall 0.850000 boosted_gb_optimal validation
28 F1 0.906667 boosted_gb_optimal validation
29 AUROC 0.917537 boosted_gb_optimal validation
30 Accuracy 1.000000 boosted_xgb_optimal train
31 Precision 1.000000 boosted_xgb_optimal train
32 Recall 1.000000 boosted_xgb_optimal train
33 F1 1.000000 boosted_xgb_optimal train
34 AUROC 1.000000 boosted_xgb_optimal train
35 Accuracy 0.943925 boosted_xgb_optimal validation
36 Precision 0.972222 boosted_xgb_optimal validation
37 Recall 0.875000 boosted_xgb_optimal validation
38 F1 0.921053 boosted_xgb_optimal validation
39 AUROC 0.930037 boosted_xgb_optimal validation
40 Accuracy 1.000000 boosted_lgbm_optimal train
41 Precision 1.000000 boosted_lgbm_optimal train
42 Recall 1.000000 boosted_lgbm_optimal train
43 F1 1.000000 boosted_lgbm_optimal train
44 AUROC 1.000000 boosted_lgbm_optimal train
45 Accuracy 0.953271 boosted_lgbm_optimal validation
46 Precision 0.972973 boosted_lgbm_optimal validation
47 Recall 0.900000 boosted_lgbm_optimal validation
48 F1 0.935065 boosted_lgbm_optimal validation
49 AUROC 0.942537 boosted_lgbm_optimal validation
50 Accuracy 0.996865 boosted_cb_optimal train
51 Precision 0.991667 boosted_cb_optimal train
52 Recall 1.000000 boosted_cb_optimal train
53 F1 0.995816 boosted_cb_optimal train
54 AUROC 0.997500 boosted_cb_optimal train
55 Accuracy 0.962617 boosted_cb_optimal validation
56 Precision 0.973684 boosted_cb_optimal validation
57 Recall 0.925000 boosted_cb_optimal validation
58 F1 0.948718 boosted_cb_optimal validation
59 AUROC 0.955037 boosted_cb_optimal validation
In [210]:
##################################
# Consolidating all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_all_performance_F1 = ensemble_train_validation_all_performance[ensemble_train_validation_all_performance['metric_name']=='F1']
ensemble_train_validation_all_performance_F1_train = ensemble_train_validation_all_performance_F1[ensemble_train_validation_all_performance_F1['set']=='train'].loc[:,"metric_value"]
ensemble_train_validation_all_performance_F1_validation = ensemble_train_validation_all_performance_F1[ensemble_train_validation_all_performance_F1['set']=='validation'].loc[:,"metric_value"]
In [211]:
##################################
# Combining all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_train_validation_all_performance_F1_plot = pd.DataFrame({'train': ensemble_train_validation_all_performance_F1_train.values,
                                                              'validation': ensemble_train_validation_all_performance_F1_validation.values},
                                                             index=ensemble_train_validation_all_performance_F1['model'].unique())
ensemble_train_validation_all_performance_F1_plot
Out[211]:
train validation
bagged_rf_optimal 0.966102 0.906667
boosted_ab_optimal 0.991525 0.921053
boosted_gb_optimal 1.000000 0.906667
boosted_xgb_optimal 1.000000 0.921053
boosted_lgbm_optimal 1.000000 0.935065
boosted_cb_optimal 0.995816 0.948718
In [212]:
##################################
# Plotting all the F1 score
# model performance measures
# between the train and validation sets
##################################
ensemble_train_validation_all_performance_F1_plot = ensemble_train_validation_all_performance_F1_plot.plot.barh(figsize=(10, 7), width=0.9)
ensemble_train_validation_all_performance_F1_plot.set_xlim(0.00,1.00)
ensemble_train_validation_all_performance_F1_plot.set_title("Model Comparison by F1 Score Performance on Train and Validation Data")
ensemble_train_validation_all_performance_F1_plot.set_xlabel("F1 Score Performance")
ensemble_train_validation_all_performance_F1_plot.set_ylabel("Ensemble Model")
ensemble_train_validation_all_performance_F1_plot.grid(False)
ensemble_train_validation_all_performance_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in ensemble_train_validation_all_performance_F1_plot.containers:
    ensemble_train_validation_all_performance_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
[Figure: Model comparison by F1 score performance on the train and validation data]
In [213]:
##################################
# Gathering all model performance measures
# for the validation data
##################################
ensemble_train_validation_all_performance_Accuracy_validation = ensemble_train_validation_all_performance[(ensemble_train_validation_all_performance['set']=='validation') & (ensemble_train_validation_all_performance['metric_name']=='Accuracy')].loc[:,"metric_value"]
ensemble_train_validation_all_performance_Precision_validation = ensemble_train_validation_all_performance[(ensemble_train_validation_all_performance['set']=='validation') & (ensemble_train_validation_all_performance['metric_name']=='Precision')].loc[:,"metric_value"]
ensemble_train_validation_all_performance_Recall_validation = ensemble_train_validation_all_performance[(ensemble_train_validation_all_performance['set']=='validation') & (ensemble_train_validation_all_performance['metric_name']=='Recall')].loc[:,"metric_value"]
ensemble_train_validation_all_performance_F1_validation = ensemble_train_validation_all_performance[(ensemble_train_validation_all_performance['set']=='validation') & (ensemble_train_validation_all_performance['metric_name']=='F1')].loc[:,"metric_value"]
ensemble_train_validation_all_performance_AUROC_validation = ensemble_train_validation_all_performance[(ensemble_train_validation_all_performance['set']=='validation') & (ensemble_train_validation_all_performance['metric_name']=='AUROC')].loc[:,"metric_value"]
In [214]:
##################################
# Combining all the model performance measures
# for the validation data
##################################
ensemble_train_validation_all_performance_all_plot_validation = pd.DataFrame({'accuracy': ensemble_train_validation_all_performance_Accuracy_validation.values,
                                                                    'precision': ensemble_train_validation_all_performance_Precision_validation.values,
                                                                    'recall': ensemble_train_validation_all_performance_Recall_validation.values,
                                                                    'f1': ensemble_train_validation_all_performance_F1_validation.values,
                                                                    'auroc': ensemble_train_validation_all_performance_AUROC_validation.values},
                                                                   index=ensemble_train_validation_all_performance['model'].unique())
ensemble_train_validation_all_performance_all_plot_validation
Out[214]:
accuracy precision recall f1 auroc
bagged_rf_optimal 0.934579 0.971429 0.850 0.906667 0.917537
boosted_ab_optimal 0.943925 0.972222 0.875 0.921053 0.930037
boosted_gb_optimal 0.934579 0.971429 0.850 0.906667 0.917537
boosted_xgb_optimal 0.943925 0.972222 0.875 0.921053 0.930037
boosted_lgbm_optimal 0.953271 0.972973 0.900 0.935065 0.942537
boosted_cb_optimal 0.962617 0.973684 0.925 0.948718 0.955037
In [215]:
##################################
# Gathering the model evaluation metrics
# for the test data
##################################
##################################
# Defining a dictionary of models and 
# their corresponding optimal model functions
##################################
models = {
    'bagged_rf_optimal': bagged_rf_optimal,
    'boosted_ab_optimal': boosted_ab_optimal,
    'boosted_gb_optimal': boosted_gb_optimal,
    'boosted_xgb_optimal': boosted_xgb_optimal,
    'boosted_lgbm_optimal': boosted_lgbm_optimal,
    'boosted_cb_optimal': boosted_cb_optimal
}
In [216]:
##################################
# Encoding the response variables
# for model testing
##################################
y_test_encoded = y_test.map({'B': 0, 'M': 1})
In [217]:
##################################
# Storing the model evaluation metrics
# for the test data
##################################
ensemble_test_all_performance = []

##################################
# Looping through each model 
# and evaluate performance on test data
##################################
for model_name, model in models.items():
   
    # Evaluating performance
    ensemble_test_all_performance_results = model_performance_evaluation(y_test_encoded, model.predict(X_test))
    
    # Adding metadata columns
    ensemble_test_all_performance_results['model'] = model_name
    ensemble_test_all_performance_results['set'] = 'test'
    
    # Storing result
    ensemble_test_all_performance.append(ensemble_test_all_performance_results)
    
In [218]:
##################################
# Consolidating all model performance measures
# for the test data
##################################
ensemble_test_all_performance = pd.concat(ensemble_test_all_performance, ignore_index=True)
print('Consolidated Ensemble Model Performance on Test Data: ')
display(ensemble_test_all_performance)
Consolidated Ensemble Model Performance on Test Data: 
metric_name metric_value model set
0 Accuracy 0.944056 bagged_rf_optimal test
1 Precision 0.941176 bagged_rf_optimal test
2 Recall 0.905660 bagged_rf_optimal test
3 F1 0.923077 bagged_rf_optimal test
4 AUROC 0.936164 bagged_rf_optimal test
5 Accuracy 0.979021 boosted_ab_optimal test
6 Precision 0.980769 boosted_ab_optimal test
7 Recall 0.962264 boosted_ab_optimal test
8 F1 0.971429 boosted_ab_optimal test
9 AUROC 0.975577 boosted_ab_optimal test
10 Accuracy 0.965035 boosted_gb_optimal test
11 Precision 0.944444 boosted_gb_optimal test
12 Recall 0.962264 boosted_gb_optimal test
13 F1 0.953271 boosted_gb_optimal test
14 AUROC 0.964465 boosted_gb_optimal test
15 Accuracy 0.965035 boosted_xgb_optimal test
16 Precision 0.944444 boosted_xgb_optimal test
17 Recall 0.962264 boosted_xgb_optimal test
18 F1 0.953271 boosted_xgb_optimal test
19 AUROC 0.964465 boosted_xgb_optimal test
20 Accuracy 0.979021 boosted_lgbm_optimal test
21 Precision 0.962963 boosted_lgbm_optimal test
22 Recall 0.981132 boosted_lgbm_optimal test
23 F1 0.971963 boosted_lgbm_optimal test
24 AUROC 0.979455 boosted_lgbm_optimal test
25 Accuracy 0.965035 boosted_cb_optimal test
26 Precision 0.913793 boosted_cb_optimal test
27 Recall 1.000000 boosted_cb_optimal test
28 F1 0.954955 boosted_cb_optimal test
29 AUROC 0.972222 boosted_cb_optimal test
In [219]:
##################################
# Gathering all model performance measures
# for the test data
##################################
ensemble_test_all_performance_Accuracy_test = ensemble_test_all_performance[(ensemble_test_all_performance['set']=='test') & (ensemble_test_all_performance['metric_name']=='Accuracy')].loc[:,"metric_value"]
ensemble_test_all_performance_Precision_test = ensemble_test_all_performance[(ensemble_test_all_performance['set']=='test') & (ensemble_test_all_performance['metric_name']=='Precision')].loc[:,"metric_value"]
ensemble_test_all_performance_Recall_test = ensemble_test_all_performance[(ensemble_test_all_performance['set']=='test') & (ensemble_test_all_performance['metric_name']=='Recall')].loc[:,"metric_value"]
ensemble_test_all_performance_F1_test = ensemble_test_all_performance[(ensemble_test_all_performance['set']=='test') & (ensemble_test_all_performance['metric_name']=='F1')].loc[:,"metric_value"]
ensemble_test_all_performance_AUROC_test = ensemble_test_all_performance[(ensemble_test_all_performance['set']=='test') & (ensemble_test_all_performance['metric_name']=='AUROC')].loc[:,"metric_value"]
In [220]:
##################################
# Combining all the model performance measures
# for the test data
##################################
ensemble_test_all_performance_all_plot_test = pd.DataFrame({'accuracy': ensemble_test_all_performance_Accuracy_test.values,
                                                            'precision': ensemble_test_all_performance_Precision_test.values,
                                                            'recall': ensemble_test_all_performance_Recall_test.values,
                                                            'f1': ensemble_test_all_performance_F1_test.values,
                                                            'auroc': ensemble_test_all_performance_AUROC_test.values},
                                                           index=ensemble_test_all_performance['model'].unique())
ensemble_test_all_performance_all_plot_test
Out[220]:
accuracy precision recall f1 auroc
bagged_rf_optimal 0.944056 0.941176 0.905660 0.923077 0.936164
boosted_ab_optimal 0.979021 0.980769 0.962264 0.971429 0.975577
boosted_gb_optimal 0.965035 0.944444 0.962264 0.953271 0.964465
boosted_xgb_optimal 0.965035 0.944444 0.962264 0.953271 0.964465
boosted_lgbm_optimal 0.979021 0.962963 0.981132 0.971963 0.979455
boosted_cb_optimal 0.965035 0.913793 1.000000 0.954955 0.972222
In [221]:
##################################
# Consolidating all the final
# bagged, boosted, stacked and blended
# model performance measures
# for the train, validation and test data
##################################
ensemble_overall_performance = pd.concat([ensemble_train_validation_all_performance, ensemble_test_all_performance], axis=0)
In [222]:
##################################
# Consolidating all the F1 score
# model performance measures
# between the train, validation and test data
##################################
ensemble_overall_performance_F1 = ensemble_overall_performance[ensemble_overall_performance['metric_name']=='F1']
ensemble_overall_performance_F1_train = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='train'].loc[:,"metric_value"]
ensemble_overall_performance_F1_validation = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='validation'].loc[:,"metric_value"]
ensemble_overall_performance_F1_test = ensemble_overall_performance_F1[ensemble_overall_performance_F1['set']=='test'].loc[:,"metric_value"]
In [223]:
##################################
# Combining all the F1 score
# model performance measures
# between the train and validation data
##################################
ensemble_overall_performance_F1_plot = pd.DataFrame({'train': ensemble_overall_performance_F1_train.values,
                                                     'validation': ensemble_overall_performance_F1_validation.values,
                                                     'test': ensemble_overall_performance_F1_test.values},
                                                    index=ensemble_overall_performance_F1['model'].unique())
ensemble_overall_performance_F1_plot
Out[223]:
train validation test
bagged_rf_optimal 0.966102 0.906667 0.923077
boosted_ab_optimal 0.991525 0.921053 0.971429
boosted_gb_optimal 1.000000 0.906667 0.953271
boosted_xgb_optimal 1.000000 0.921053 0.953271
boosted_lgbm_optimal 1.000000 0.935065 0.971963
boosted_cb_optimal 0.995816 0.948718 0.954955
In [224]:
##################################
# Plotting all the F1 score
# model performance measures
# between train, validation and test sets
##################################
ensemble_overall_performance_F1_plot = ensemble_overall_performance_F1_plot.plot.barh(figsize=(10, 8), width=0.9)
ensemble_overall_performance_F1_plot.set_xlim(0.00,1.00)
ensemble_overall_performance_F1_plot.set_title("Model Comparison by F1 Score Performance on Train, Validation and Test Data")
ensemble_overall_performance_F1_plot.set_xlabel("F1 Score Performance")
ensemble_overall_performance_F1_plot.set_ylabel("Ensemble Model")
ensemble_overall_performance_F1_plot.grid(False)
ensemble_overall_performance_F1_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
for container in ensemble_overall_performance_F1_plot.containers:
    ensemble_overall_performance_F1_plot.bar_label(container, fmt='%.5f', padding=-50, color='white', fontweight='bold')
[Figure: Model comparison by F1 score performance on the train, validation and test data]

1.9. Model Monitoring using the NannyML Framework ¶

1.9.1 Simulated Baseline Control¶

Baseline Control represents the stable reference state of a machine learning system against which all post-deployment data and model behavior are compared. It is typically generated using a clean, representative sample of pre-deployment data or early production data collected under known, reliable conditions. This dataset serves as the foundation for defining expected feature distributions, class priors, and performance benchmarks. In post-deployment monitoring, the Baseline Control is essential for distinguishing normal variability from problematic drift or degradation. Metrics such as feature stability, label proportions, and estimated performance consistency characterize its reliability. NannyML operationalizes Baseline Control by allowing users to designate a reference period, fit estimators such as CBPE (Confidence-Based Performance Estimation) on that data, and compute statistical boundaries or confidence intervals. Deviations in subsequent analysis periods, whether in feature distributions, prediction probabilities, or estimated performance, are then detected relative to this baseline. The Baseline Control thus functions as both an empirical anchor and a diagnostic standard, ensuring that drift alerts and performance anomalies are meaningfully contextualized against the model’s original operating state.
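
As a simplified, library-free sketch of the baseline idea (a sketch of the principle, not NannyML's implementation), the snippet below derives a control band from per-chunk statistics of a hypothetical reference stream; the column name, chunk layout, and the three-standard-deviation band are illustrative assumptions.

##################################
# Simplified sketch (not NannyML): deriving a baseline control band
# from per-chunk statistics of a hypothetical reference stream
##################################
import numpy as np
import pandas as pd

rng = np.random.default_rng(987654321)
reference_stream = pd.DataFrame({
    'chunk': np.repeat(np.arange(10), 100),      # 10 reference chunks of 100 rows each
    'radius_mean': rng.normal(14.0, 3.5, 1000)   # hypothetical feature values
})

chunk_means = reference_stream.groupby('chunk')['radius_mean'].mean()
center = chunk_means.mean()
band = 3 * chunk_means.std()
lower_bound, upper_bound = center - band, center + band
# A later production chunk whose mean falls outside [lower_bound, upper_bound]
# would be flagged for review against the baseline control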

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.
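
As a simplified illustration of the confidence-based idea (a sketch of the principle, not NannyML's internal implementation), the snippet below treats calibrated predicted probabilities as expected outcome frequencies and accumulates an expected confusion matrix without any ground-truth labels; the probability values and the 0.5 threshold are illustrative assumptions.

##################################
# Simplified sketch of label-free performance estimation:
# with calibrated probabilities p = P(y=1 | x), each prediction
# contributes expected counts to the confusion matrix
##################################
import numpy as np

def expected_confusion_matrix(y_pred_proba, threshold=0.5):
    p = np.asarray(y_pred_proba, dtype=float)
    y_pred = (p >= threshold).astype(int)
    # Under calibration, a positive prediction is a true positive with
    # probability p and a false positive with probability 1 - p
    tp = np.sum(p[y_pred == 1])
    fp = np.sum(1 - p[y_pred == 1])
    fn = np.sum(p[y_pred == 0])
    tn = np.sum(1 - p[y_pred == 0])
    return tp, fp, fn, tn

# Hypothetical probabilities from one unlabeled production chunk
tp, fp, fn, tn = expected_confusion_matrix([0.95, 0.80, 0.30, 0.10, 0.65])
estimated_precision = tp / (tp + fp)
estimated_recall = tp / (tp + fn)
estimated_f1 = 2 * estimated_precision * estimated_recall / (estimated_precision + estimated_recall)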

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.
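
A minimal sketch of the univariate KS comparison described above is shown below, using scipy.stats.ks_2samp to compare one feature's reference distribution against a single analysis chunk; the simulated distributions, the feature values, and the 0.05 significance level are illustrative assumptions.

##################################
# Minimal sketch: two-sample Kolmogorov-Smirnov test (scipy)
# applied per feature per chunk against the reference distribution
##################################
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(987654321)
reference_feature = rng.normal(loc=14.0, scale=3.5, size=1000)   # baseline distribution of a feature
analysis_chunk = rng.normal(loc=16.0, scale=3.5, size=100)       # shifted production chunk

ks_statistic, p_value = ks_2samp(reference_feature, analysis_chunk)
drift_alert = p_value < 0.05   # flag the chunk when the difference is unlikely to occur by chance
print(f"KS statistic: {ks_statistic:.3f}, p-value: {p_value:.4f}, drift alert: {drift_alert}")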

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.
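
A hedged sketch of realized (labeled) performance tracking with NannyML's PerformanceCalculator is given below; the reference_df and analysis_df frames, their column names (y_pred_proba, y_pred, diagnosis, timestamp), and the score-generating process are hypothetical stand-ins, and the constructor arguments may need adjustment for the installed NannyML version.

##################################
# Sketch: realized performance tracking once labels are available,
# using hypothetical reference and analysis frames
##################################
import numpy as np
import pandas as pd
import nannyml as nml

rng = np.random.default_rng(987654321)

def make_monitoring_frame(n_rows, start):
    # Hypothetical frame holding model scores, hard predictions,
    # eventually-available labels, and timestamps
    proba = rng.uniform(0.0, 1.0, n_rows)
    return pd.DataFrame({
        'y_pred_proba': proba,
        'y_pred': (proba >= 0.5).astype(int),
        'diagnosis': ((proba + rng.normal(0.0, 0.2, n_rows)) >= 0.5).astype(int),
        'timestamp': pd.date_range(start, periods=n_rows, freq='h')
    })

reference_df = make_monitoring_frame(1000, '2025-01-01')   # baseline control period
analysis_df = make_monitoring_frame(500, '2025-03-01')     # post-deployment chunks

performance_calculator = nml.PerformanceCalculator(
    y_true='diagnosis',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc', 'f1'],
    chunk_size=100
)
performance_calculator.fit(reference_df)
realized_results = performance_calculator.calculate(analysis_df)
realized_results.plot().show()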

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.
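
A hedged sketch of NannyML's CBPE estimator is given below, reusing the hypothetical reference_df and analysis_df frames from the performance-tracking sketch above; the column names and constructor arguments are assumptions and may need adjustment for the installed NannyML version.

##################################
# Sketch: label-free performance estimation with CBPE, fitted on the
# labeled baseline (reference) period and applied to unlabeled chunks
##################################
import nannyml as nml

cbpe_estimator = nml.CBPE(
    y_true='diagnosis',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc'],
    chunk_size=100
)
cbpe_estimator.fit(reference_df)   # learns the confidence-to-performance relationship on labeled data
estimated_results = cbpe_estimator.estimate(analysis_df.drop(columns=['diagnosis']))
estimated_results.plot().show()    # estimated ROC-AUC with confidence bands per chunk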

  1. A synthetic time-ordered data stream called Baseline Control was created by repeatedly sampling balanced subsets of the two classes (diagnosis=M and diagnosis=B) from the combined validation and test data, simulating production data chunks over time. Each chunk is labeled with a sequential index and timestamp, allowing downstream monitoring analyses to mimic real-world data flow conditions.
  2. The simulated dataset was defined by the following parameters:
    • N_CHUNKS = total number of sequential data segments to simulate, representing distinct time-based chunks in the production stream, fixed at 10
    • CHUNK_SIZE = total number of samples included in each simulated chunk to maintain a consistent batch size, fixed at 100
    • RANDOM_STATE = fixed seed for reproducibility of the random sampling and shuffling processes, assigned as 987654321
    • CHUNK_SEEDS = unique random seeds for each chunk to introduce controlled variability across simulated time steps, assigned as ten seeds from 999999999 down to 000000000
    • TARGET_COL = name of the column containing the true target labels used for binary classification, assigned as the diagnosis column
    • LABEL_MAP = maps the categorical target labels (diagnosis=B and diagnosis=M) to their numeric equivalents (0 and 1) for model compatibility
    • FEATURE_COLUMNS = lists the 30 features used as input predictors in the machine learning model and data stream simulation
  3. Exploratory data analysis was performed to establish the baseline control for post-deployment anomaly detection including:
    • Distributions to visualize the variability of each feature across baseline chunks, establishing the expected range of normal behavior for drift detection
    • Mean lines to track average feature values over time to define stable mean trends that serve as reference signals for identifying deviations in production data
    • Class proportions to monitor the baseline class balance between diagnosis=M and diagnosis=B across chunks to characterize the expected label distribution prior to detecting prior shifts
    • Missingness rates to measure and visualize the normal rate of missing data per feature over time, forming a benchmark for spotting unusual missingness spikes or data quality issues
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • No feature-level distributional shifts were observed, with the Kolmogorov–Smirnov (KS) test statistics remaining below their drift thresholds across all chunks.
    • No performance degradation alerts were observed for any chunk, with the CBPE-estimated ROC-AUC trends staying within the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • No deviations exceeding the defined performance difference threshold were observed when comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
In [225]:
##################################
# Defining the global parameters
# for the post-model deployment scenario simulation
##################################
N_CHUNKS = 10
CHUNK_SIZE = 100
RANDOM_STATE = 987654321
CHUNK_SEEDS = [999999999, 888888888, 777777777, 666666666, 555555555,
               444444444, 333333333, 222222222, 111111111, 000000000]
TARGET_COL = 'diagnosis'
LABEL_MAP = {'B': 0, 'M': 1}
FEATURE_COLUMNS = [
    'radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean',
    'compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean',
    'radius_se','texture_se','perimeter_se','area_se','smoothness_se',
    'compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se',
    'radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst',
    'compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst'
]
In [226]:
##################################
# Creating the monitoring baseline control
# by combining both validation and test data together
##################################
breast_cancer_monitoring_baseline = pd.concat(
    [breast_cancer_validation, breast_cancer_test], 
    axis=0,            
    ignore_index=True
)
In [227]:
##################################
# Defining a function for generating
# a post-model data stream simulation
##################################
def make_stream_from_dataframe(df, n_chunks=N_CHUNKS, chunk_size=CHUNK_SIZE, chunk_seeds=CHUNK_SEEDS):
    """Creates a synthetic ordered stream (chunks) including at least one instance of both 'M' and 'B' classes."""
    # Initializing an empty list to store each generated chunk
    rows = []

    # Splitting the dataframe into the two classes
    df_M = df[df[TARGET_COL] == "M"]
    df_B = df[df[TARGET_COL] == "B"]

    # Determining roughly balanced counts per chunk
    half_size = chunk_size // 2
    
    # Iterating through the desired number of chunks (simulated time intervals)
    for chunk_idx, seed in enumerate(chunk_seeds[:n_chunks]):
        # Initializing a random number generator by chunk for reproducibility
        rng = np.random.RandomState(seed)
        # Sampling half of the chunk from each class (with replacement)
        sample_M = df_M.sample(
            n=half_size, replace=True, random_state=rng
        )
        sample_B = df_B.sample(
            n=chunk_size - half_size, replace=True, random_state=rng
        )

        # Combining, shuffling, and labeling with chunk/time index
        chunk = pd.concat([sample_M, sample_B], ignore_index=True).sample(
            frac=1, random_state=rng
        )
        chunk["__chunk"] = chunk_idx
        chunk["__timestamp"] = chunk_idx

        rows.append(chunk)
    # Combining all chunks into a single DataFrame that represents a continuous data stream
    return pd.concat(rows, ignore_index=True)
In [228]:
##################################
# Defining a function for 
# computing model predictions and probabilities
# using the final selected model - categorical boosting model
##################################
def compute_preds_and_proba(pipeline, X):
    """Returns predicted labels and class 1 probabilities"""
    # Generating predicted class labels (0 or 1) using the trained model pipeline
    y_pred = pipeline.predict(X)
    try:
        # Obtaining the probability of the positive class (class 1)
        y_proba = pipeline.predict_proba(X)[:, 1]
    except Exception:
        # Computing the probability approximation if predict_proba is unavailable
        y_proba = 1 / (1 + np.exp(-pipeline.decision_function(X)))
    # Returning both predicted labels and corresponding class-1 probabilities
    return y_pred, y_proba
In [229]:
##################################
# Defining a function for 
# simulating the baseline control
##################################
def simulate_P1_baseline(df):
    # Creating a time-ordered synthetic stream of data chunks
    return make_stream_from_dataframe(df)
    
In [230]:
##################################
# Defining a function for 
# plotting chunk-based boxplots for selected features
# for baseline control
##################################
sns.set(style="whitegrid", context="notebook")

def plot_baseline_feature_boxplot(df_base, features, scenario_name="Baseline"):
    """Chunk-based boxplots for selected features in baseline."""
    n_features = len(features)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]
    for ax, f in zip(axes, features):
        sns.boxplot(
            data=df_base,
            x="__chunk", y=f, ax=ax, showfliers=False, color="#4C72B0"
        )
        ax.set_title(f"Chunk-wise {f}: {scenario_name}")
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel(f)
        ax.set_xticks(range(10))
    plt.tight_layout()
    plt.show()
In [231]:
##################################
# Defining a function for 
# plotting feature mean per chunk
# for baseline control
##################################
def plot_baseline_feature_mean_line(df_base, features, scenario_name="Baseline"):
    """Plots per-feature mean values over chunks (one chart per feature)."""
    mean_values = df_base.groupby('__chunk')[features].mean()
    
    n_features = len(features)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]

    for ax, f in zip(axes, features):
        sns.lineplot(x=mean_values.index, y=mean_values[f], color="#4C72B0", ax=ax)
        ax.set_title(f"Chunk-wise Mean of {f} ({scenario_name})", fontsize=11)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("Mean Value")
        ax.grid(True, alpha=0.3)
        ax.set_xticks(range(10))
    
    plt.tight_layout()
    plt.show()
In [232]:
##################################
# Defining a function for 
# plotting class proportion ('M' vs 'B') across chunks
# for baseline control
##################################
def plot_baseline_class_proportion(df_base, scenario_name="Baseline"):
    """Class proportion ('M' vs 'B') across chunks for baseline."""
    prop = df_base.groupby('__chunk')['diagnosis'].value_counts(normalize=True).unstack().fillna(0)
    fig, ax = plt.subplots(figsize=(14, 3))
    sns.lineplot(data=prop['M'], label="Proportion of 'M'", color="#4C72B0", ax=ax)
    ax.set_title(f"Class Proportion per Chunk: {scenario_name}")
    ax.set_xlabel("Chunk Index (Simulated Time)")
    ax.set_ylabel("Proportion of 'M'")
    ax.set_ylim(-0.1, 1)
    ax.set_xticks(range(10))
    plt.show()
In [233]:
##################################
# Defining a function for 
# plotting missing fraction per chunk
# for baseline control
##################################
def plot_baseline_missingness_spike(df_base, features, scenario_name="Baseline"):
    """Missing fraction per chunk for selected features, one plot per feature."""
    miss = df_base.groupby('__chunk')[features].apply(lambda x: x.isna().mean())
    
    n_features = len(features)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]

    for ax, f in zip(axes, features):
        sns.lineplot(x=miss.index, y=miss[f], color="#4C72B0", ax=ax)
        ax.set_title(f"Missingness Spike over Time: {f} ({scenario_name})", fontsize=11)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("Missing Rate")
        ax.set_ylim(-0.1, 1)
        ax.set_xticks(range(10))
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

    
In [234]:
##################################
# Simulating post-deployment data drift scenario 1 = baseline control
##################################
p1 = simulate_P1_baseline(breast_cancer_monitoring_baseline)
In [235]:
##################################
# Exploring the simulated baseline control
##################################
display(p1)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 M 21.71 17.25 140.90 1546.0 0.09384 0.08562 0.11680 0.084650 0.1717 ... 199.50 3143.0 0.1363 0.16280 0.28610 0.18200 0.2510 0.06494 0 0
1 B 12.25 22.44 78.18 466.5 0.08192 0.05200 0.01714 0.012610 0.1544 ... 92.74 622.9 0.1256 0.18040 0.12300 0.06335 0.3100 0.08203 0 0
2 B 10.65 25.22 68.01 347.0 0.09657 0.07234 0.02379 0.016150 0.1897 ... 77.98 455.7 0.1499 0.13980 0.11250 0.06136 0.3409 0.08147 0 0
3 M 24.25 20.20 166.20 1761.0 0.14470 0.28670 0.42680 0.201200 0.2655 ... 180.90 2073.0 0.1696 0.42440 0.58030 0.22480 0.3222 0.08009 0 0
4 B 10.90 12.96 68.69 366.8 0.07515 0.03718 0.00309 0.006588 0.1442 ... 78.07 470.0 0.1171 0.08294 0.01854 0.03953 0.2738 0.07685 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 B 14.53 19.34 94.25 659.7 0.08388 0.07800 0.08817 0.029250 0.1473 ... 108.10 830.5 0.1089 0.26490 0.37790 0.09594 0.2471 0.07463 9 9
996 M 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.094510 0.1860 ... 142.20 1493.0 0.1492 0.25360 0.37590 0.15100 0.3074 0.07863 9 9
997 M 14.19 23.81 92.87 610.7 0.09463 0.13060 0.11150 0.064620 0.2235 ... 115.00 811.3 0.1559 0.40590 0.37440 0.17720 0.4724 0.10260 9 9
998 M 15.12 16.68 98.78 716.6 0.08876 0.09588 0.07550 0.040790 0.1594 ... 117.70 989.5 0.1491 0.33310 0.33270 0.12520 0.3415 0.09740 9 9
999 B 10.60 18.95 69.28 346.4 0.09688 0.11470 0.06387 0.026420 0.1922 ... 78.28 424.8 0.1213 0.25150 0.19160 0.07926 0.2940 0.07587 9 9

1000 rows × 33 columns

In [236]:
##################################
# Visualizing feature variability
# for baseline control
##################################
plot_baseline_feature_boxplot(p1, FEATURE_COLUMNS)
In [237]:
##################################
# Visualizing feature variability
# for baseline control
##################################
plot_baseline_feature_mean_line(p1, FEATURE_COLUMNS)
In [238]:
##################################
# Inspecting baseline class balance stability
# for baseline control
##################################
plot_baseline_class_proportion(p1)
In [239]:
##################################
# Evaluating missingness spike
# for baseline control
##################################
plot_baseline_missingness_spike(p1, FEATURE_COLUMNS)
In [240]:
##################################
# Creating a copy of the simulated baseline control
# to serve as the reference dataset for the drift calculator
##################################
p1_univariate_drift_df = p1.copy()
In [241]:
##################################
# Defining a function for fitting
# a drift calculator using the simulated baseline control and
# detecting univariate drift for a given scenario
##################################
def detect_univariate_drift(baseline_df, scenario_df, feature_columns, scenario_name="Scenario"):
    """
    Fits a UnivariateDriftCalculator on baseline data and detects drift on scenario data.
    """

    # Initializing the univariate drift calculator
    univariate_drift_calculator = nml.drift.UnivariateDriftCalculator(
        column_names=feature_columns,
        treat_as_categorical=None,
        continuous_methods=["kolmogorov_smirnov"]
    )

    # Fitting the univariate drift calculator on the baseline control
    univariate_drift_calculator.fit(baseline_df)

    # Detecting univariate drift on the scenario dataset
    results = univariate_drift_calculator.calculate(
        data=scenario_df
    )

    # Summarizing the drift detection results
    summary = results.filter(period="analysis").to_df()
    print(f"Univariate drift visualization generated for {scenario_name}")
    print(summary.head(10))

    return results
    
In [242]:
##################################
# Defining a function for visualizing
# univariate drift for a given scenario
##################################
def plot_univariate_drift_summary(drift_results, feature_columns, scenario_name="Scenario"):
    """
    Visualize KS statistics vs threshold per feature and summarize drift counts.
    """
    # Converting results to a DataFrame
    df = drift_results.to_df().copy()

    # Handling MultiIndex columns
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['__'.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Extracting chunk_index
    chunk_col_candidates = ["chunk__chunk__chunk_index", "chunk_index"]
    for col in chunk_col_candidates:
        if col in df.columns:
            df["chunk_index"] = df[col]
            break
    else:
        if "chunk_index" in df.index.names:
            df = df.reset_index()
        if "chunk_index" not in df.columns:
            raise KeyError("Cannot find 'chunk_index' in drift_results output.")

    # Identifying the KS value, threshold, and alert columns
    value_col = [c for c in df.columns if c.endswith("__kolmogorov_smirnov__value")]
    upper_threshold_col = [c for c in df.columns if c.endswith("__kolmogorov_smirnov__upper_threshold")]
    alert_col = [c for c in df.columns if c.endswith("__kolmogorov_smirnov__alert")]

    if not value_col or not upper_threshold_col:
        raise KeyError("Cannot find KS statistic or threshold columns in drift_results output.")

    value_col = value_col[0]
    thresh_col = upper_threshold_col[0]

    # Plotting all features row-wise
    n_features = len(feature_columns)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]

    sns.set_style("whitegrid")

    for ax, feature in zip(axes, feature_columns):
        # Finding the corresponding KS column in the dataframe
        ks_col_name = f"{feature}__kolmogorov_smirnov__value"
        thresh_col_name = f"{feature}__kolmogorov_smirnov__upper_threshold"
        if ks_col_name not in df.columns or thresh_col_name not in df.columns:
            print(f"Warning: {feature} not found in drift results. Skipping.")
            continue

        subdf = df[["chunk_index", ks_col_name, thresh_col_name]].copy()
        subdf.columns = ["chunk_index", "statistic", "threshold"]

        sns.lineplot(
            data=subdf,
            x="chunk_index",
            y="statistic",
            color="blue",
            ax=ax,
            label="KS Statistic"
        )
        ax.axhline(
            y=subdf["threshold"].iloc[0],
            color="red",
            linestyle="--",
            label="Threshold"
        )
        ax.set_title(f"{feature} ({scenario_name})", fontsize=10)
        ax.set_ylabel("KS Statistic")
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.legend(loc="upper right", fontsize=8)
        ax.set_xticks(range(10))
        ax.grid(True, alpha=0.3)
        ax.set_ylim(-0.05, 1.05)

    plt.tight_layout()
    plt.show()

    # Formulating the summary table indicating the number of chunks exceeding threshold per feature
    univariate_drift_summary_list = []
    for feature in feature_columns:
        ks_col_name = f"{feature}__kolmogorov_smirnov__value"
        thresh_col_name = f"{feature}__kolmogorov_smirnov__upper_threshold"
        if ks_col_name not in df.columns or thresh_col_name not in df.columns:
            drift_count = 0
        else:
            drift_count = (df[ks_col_name] > df[thresh_col_name]).sum()
        univariate_drift_summary_list.append({"feature": feature, "chunk_drift_count": drift_count})

    univariate_drift_summary = pd.DataFrame(univariate_drift_summary_list)

    print("Univariate Drift Summary Table:")
    display(univariate_drift_summary)

    return univariate_drift_summary
    
In [243]:
##################################
# Detecting univariate drift for baseline control
##################################
univariate_drift_analysis_p1 = detect_univariate_drift(p1, p1, FEATURE_COLUMNS, "Baseline Control")
Univariate drift visualization generated for Baseline Control
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.039        0.101506            None  ...               None   
1              0.055        0.101506            None  ...               None   
2              0.079        0.101506            None  ...               None   
3              0.055        0.101506            None  ...               None   
4              0.070        0.101506            None  ...               None   
5              0.060        0.101506            None  ...               None   
6              0.051        0.101506            None  ...               None   
7              0.076        0.101506            None  ...               None   
8              0.081        0.101506            None  ...               None   
9              0.053        0.101506            None  ...               None   

                 texture_se                                         \
         kolmogorov_smirnov                                          
   alert              value upper_threshold lower_threshold  alert   
0  False              0.056        0.144826            None  False   
1  False              0.087        0.144826            None  False   
2  False              0.049        0.144826            None  False   
3  False              0.071        0.144826            None  False   
4  False              0.099        0.144826            None  False   
5  False              0.106        0.144826            None  False   
6  False              0.095        0.144826            None  False   
7  False              0.066        0.144826            None  False   
8  False              0.059        0.144826            None  False   
9  False              0.113        0.144826            None  False   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0              0.047        0.143381            None  False  
1              0.091        0.143381            None  False  
2              0.054        0.143381            None  False  
3              0.087        0.143381            None  False  
4              0.090        0.143381            None  False  
5              0.099        0.143381            None  False  
6              0.071        0.143381            None  False  
7              0.085        0.143381            None  False  
8              0.120        0.143381            None  False  
9              0.064        0.143381            None  False  

[10 rows x 127 columns]
In [244]:
##################################
# Visualizing univariate drift for baseline control
##################################
univariate_drift_analysis_visualization_p1 = plot_univariate_drift_summary(univariate_drift_analysis_p1, FEATURE_COLUMNS, "Baseline Control")
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 0
1 texture_mean 0
2 perimeter_mean 0
3 area_mean 0
4 smoothness_mean 0
5 compactness_mean 0
6 concavity_mean 0
7 concave points_mean 0
8 symmetry_mean 0
9 fractal_dimension_mean 0
10 radius_se 0
11 texture_se 0
12 perimeter_se 0
13 area_se 0
14 smoothness_se 0
15 compactness_se 0
16 concavity_se 0
17 concave points_se 0
18 symmetry_se 0
19 fractal_dimension_se 0
20 radius_worst 0
21 texture_worst 0
22 perimeter_worst 0
23 area_worst 0
24 smoothness_worst 0
25 compactness_worst 0
26 concavity_worst 0
27 concave points_worst 0
28 symmetry_worst 0
29 fractal_dimension_worst 0
In [245]:
##################################
# Defining a function for fitting
# a CBPE estimator using the simulated baseline control and
# estimating CBPE performance per chunk for a given scenario
##################################
def estimate_chunk_cbpe_performance(reference_df, target_df, model_pipeline, feature_columns, target_col='diagnosis', label_map={'B':0, 'M':1}, chunk_col='__chunk'):
    """
    Fits CBPE Estimator on baseline data and estimate performance per chunk on scenario data.
    """

    # Preparing the reference data
    X_ref = reference_df[feature_columns]
    y_ref = reference_df[target_col].map(label_map)
    y_pred_ref, y_proba_ref = compute_preds_and_proba(model_pipeline, X_ref)

    ref_df = reference_df.copy()
    ref_df['y_true'] = y_ref
    ref_df['y_pred'] = y_pred_ref
    ref_df['y_pred_proba'] = y_proba_ref

    # Defining a chunker
    chunker = DefaultChunker()

    # Fitting CBPE on the reference data
    cbpe_estimator = CBPE(
        y_true='y_true',
        y_pred_proba='y_pred_proba',
        y_pred='y_pred',
        metrics=['roc_auc'],
        problem_type='classification_binary',
        chunker=chunker
    )
    cbpe_estimator.fit(ref_df)

    # Preparing the scenario data
    X_target = target_df[feature_columns]
    y_pred_target, y_proba_target = compute_preds_and_proba(model_pipeline, X_target)

    target_df_copy = target_df.copy()
    target_df_copy['y_pred'] = y_pred_target
    target_df_copy['y_pred_proba'] = y_proba_target

    # Estimating CBPE performance per chunk on the scenario data
    perf_results = cbpe_estimator.estimate(target_df_copy)

    chunk_cbpe_performance_summary = perf_results.to_df()

    print("Chunk CBPE Performance Summary Table:")
    display(chunk_cbpe_performance_summary)

    return chunk_cbpe_performance_summary
    
In [246]:
##################################
# Defining a function for visualizing
# CBPE performance for a given scenario
##################################
def plot_chunk_cbpe_performance(performance_df, baseline_name="Baseline", scenario_name="Scenario"):
    """
    Visualize CBPE-estimated ROC-AUC evolution per chunk for both reference and analysis periods,
    and summarize performance degradation alerts.
    """

    # Flattening the MultiIndex columns
    df = performance_df.copy()
    df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]
    
    # Ensure expected columns exist
    required_cols = [
        'chunk_chunk_index', 'chunk_period', 'roc_auc_value',
        'roc_auc_lower_confidence_boundary', 'roc_auc_upper_confidence_boundary', 'roc_auc_alert'
    ]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Missing expected columns: {missing}")
    
    # Splitting results for reference and analysis scenarios
    df_ref = df[df['chunk_period'] == 'reference']
    df_analysis = df[df['chunk_period'] == 'analysis']

    # Using the reference confidence boundaries for both plots
    ref_bounds = df_ref[['chunk_chunk_index', 'roc_auc_lower_confidence_boundary', 'roc_auc_upper_confidence_boundary']]
    df_analysis = pd.merge(
        df_analysis.drop(columns=['roc_auc_lower_confidence_boundary', 'roc_auc_upper_confidence_boundary']),
        ref_bounds,
        on='chunk_chunk_index',
        how='left'
    )
    
    # Create a two-row plot
    fig, axes = plt.subplots(2, 1, figsize=(12, 7), sharex=True)
    sns.set_style("whitegrid")
    
    # Generating a helper function for consistent plotting
    def plot_cbpe_line(sub_df, ax, color, title):
        # Plotting the estimated performance
        sns.lineplot(
            data=sub_df,
            x='chunk_chunk_index',
            y='roc_auc_value',
            color=color,
            marker='o',
            ax=ax,
            label='Estimated ROC-AUC'
        )
    
        # Plotting the confidence region
        ax.fill_between(
            sub_df['chunk_chunk_index'],
            sub_df['roc_auc_lower_confidence_boundary'],
            sub_df['roc_auc_upper_confidence_boundary'],
            color=color,
            alpha=0.15
        )
    
        # Plotting the confidence boundary lines
        sns.lineplot(
            data=sub_df,
            x='chunk_chunk_index',
            y='roc_auc_upper_confidence_boundary',
            color='black',
            linestyle='-',
            ax=ax,
            label='Upper Confidence Bound'
        )
        sns.lineplot(
            data=sub_df,
            x='chunk_chunk_index',
            y='roc_auc_lower_confidence_boundary',
            color='red',
            linestyle='--',
            ax=ax,
            label='Lower Confidence Bound'
        )
    
        ax.set_title(title, fontsize=12)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("CBPE-Estimated ROC-AUC")
        ax.set_ylim(0.8, 1.01)
        ax.set_yticks(np.arange(0.8, 1.01, 0.05))
        ax.set_xticks(range(10))
        ax.grid(True, alpha=0.3)
        ax.legend(loc='lower right', fontsize=8)
    
    # Plotting the reference CBPE ROC-AUC estimates
    plot_cbpe_line(df_ref, axes[0], color='blue', title=f"{baseline_name} (Reference Period)")
    
    # Plotting the scenario CBPE ROC-AUC estimates
    plot_cbpe_line(df_analysis, axes[1], color='orange', title=f"{scenario_name} (Analysis Period)")
    
    plt.tight_layout()
    plt.show()
    
    # Formulating the summary table indicating the number of ROC-AUC alerts per chunk
    chunk_cbpe_performance_summary = (
        df.groupby(['chunk_chunk_index', 'chunk_period'])['roc_auc_alert']
        .sum()
        .reset_index()
        .rename(columns={'roc_auc_alert': 'cbpe_roc_auc_alert_count'})
    )
    
    print("Chunk CBPE Performance Summary Table:")
    display(chunk_cbpe_performance_summary)
    
    return chunk_cbpe_performance_summary
In [247]:
##################################
# Estimating CBPE performance for baseline control
##################################
chunk_cbpe_performance_analysis_p1 = estimate_chunk_cbpe_performance(p1, p1, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.0 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.0 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.0 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.0 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.0 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.0 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.0 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.0 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.0 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.0 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.997118 0.003343 NaN 1.0 0.987090 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.993730 0.003343 NaN 1.0 0.983702 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.996812 0.003343 NaN 1.0 0.986784 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.995836 0.003343 NaN 1.0 0.985808 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.996884 0.003343 NaN 1.0 0.986857 1 0.986902 False
15 [500:599] 5 500 599 None None analysis 0.998574 0.003343 NaN 1.0 0.988546 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.995688 0.003343 NaN 1.0 0.985660 1 0.986902 False
17 [700:799] 7 700 799 None None analysis 0.998026 0.003343 NaN 1.0 0.987998 1 0.986902 False
18 [800:899] 8 800 899 None None analysis 0.995331 0.003343 NaN 1.0 0.985303 1 0.986902 False
19 [900:999] 9 900 999 None None analysis 0.996120 0.003343 NaN 1.0 0.986092 1 0.986902 False
In [248]:
##################################
# Visualizing CBPE performance for baseline control
##################################
chunk_cbpe_performance_analysis_visualization_p1 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p1, baseline_name="Baseline Control", scenario_name="Baseline Control")
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 0
3 1 reference 0
4 2 analysis 0
5 2 reference 0
6 3 analysis 0
7 3 reference 0
8 4 analysis 0
9 4 reference 0
10 5 analysis 0
11 5 reference 0
12 6 analysis 0
13 6 reference 0
14 7 analysis 0
15 7 reference 0
16 8 analysis 0
17 8 reference 0
18 9 analysis 0
19 9 reference 0
In [249]:
##################################
# Defining a function for fitting
# a PerformanceCalculator using the simulated baseline control and
# calculating realized performance per chunk for a given scenario
##################################
def calculate_chunk_realized_performance(reference_df, target_df, model_pipeline, feature_columns, target_col='diagnosis', label_map={'B':0, 'M':1}, chunk_col='__chunk'):
    """
    Fits a PerformanceCalculator on baseline (reference) data and calculates realized (true) performance per chunk
    for the given scenario data.
    """

    # Preparing reference data
    X_ref = reference_df[feature_columns]
    y_ref = reference_df[target_col].map(label_map)
    y_pred_ref, y_proba_ref = compute_preds_and_proba(model_pipeline, X_ref)
    
    ref_df = reference_df.copy()
    ref_df['y_true'] = y_ref
    ref_df['y_pred'] = y_pred_ref
    ref_df['y_pred_proba'] = y_proba_ref
    
    # Defining a chunker
    chunker = DefaultChunker()
    
    # Initialize PerformanceCalculator
    pc = PerformanceCalculator(
        y_true='y_true',
        y_pred='y_pred',
        y_pred_proba='y_pred_proba',
        metrics=['roc_auc'],
        problem_type='classification_binary',
        chunker=chunker
    )
    
    # Fitting on reference data
    pc.fit(ref_df)
    
    # Preparing the scenario data
    X_target = target_df[feature_columns]
    y_target = target_df[target_col].map(label_map)
    y_pred_target, y_proba_target = compute_preds_and_proba(model_pipeline, X_target)
    
    target_df_copy = target_df.copy()
    target_df_copy['y_true'] = y_target
    target_df_copy['y_pred'] = y_pred_target
    target_df_copy['y_pred_proba'] = y_proba_target
    
    # Calculating realized (true) performance per chunk on scenario data
    realized_results = pc.calculate(target_df_copy)
    chunk_realized_performance_summary = realized_results.to_df()
    
    print("Chunk Realized Performance Summary Table:")
    display(chunk_realized_performance_summary)
    
    return chunk_realized_performance_summary
In [250]:
##################################
# Calculating realized performance for baseline control
##################################
chunk_realized_performance_analysis_p1 = calculate_chunk_realized_performance(p1, p1, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.9948 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.9924 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.9944 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.9908 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.9980 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.9976 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.9928 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.0000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.9980 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.9944 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.9948 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.9924 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.9944 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.9908 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.9980 1 0.986902 False
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.9976 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.9928 1 0.986902 False
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 1.0000 1 0.986902 False
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 0.9980 1 0.986902 False
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 0.9944 1 0.986902 False
In [251]:
##################################
# Defining a function for visualizing
# realized performance for a given scenario
##################################
def plot_chunk_realized_performance(cbpe_df, realized_df, baseline_name="Baseline", scenario_name="Scenario", diff_threshold=0.05):
    """
    Comparing CBPE-estimated vs realized (true) ROC-AUC per chunk for Baseline and Scenario data.
    """
    # Flattening column MultiIndex if present
    cbpe_df = cbpe_df.copy()
    realized_df = realized_df.copy()
    cbpe_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in cbpe_df.columns]
    realized_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in realized_df.columns]
    
    # Ensuring both dataframes have comparable structures
    for df_name, df in [('CBPE', cbpe_df), ('Realized', realized_df)]:
        required_cols = ['chunk_chunk_index', 'chunk_period', 'roc_auc_value']
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            raise KeyError(f"{df_name} DataFrame missing columns: {missing}")
    
    # Separating reference and analysis periods
    cbpe_ref = cbpe_df[cbpe_df['chunk_period'] == 'reference']
    cbpe_analysis = cbpe_df[cbpe_df['chunk_period'] == 'analysis']
    realized_ref = realized_df[realized_df['chunk_period'] == 'reference']
    realized_analysis = realized_df[realized_df['chunk_period'] == 'analysis']
    
    # Creating stacked subplots reference and analysis scenarios
    fig, axes = plt.subplots(2, 1, figsize=(12, 7), sharex=True)
    sns.set_style("whitegrid")
    
    def plot_cbpe_vs_realized(sub_cbpe, sub_realized, ax, color_est, color_real, title):
        sns.lineplot(
            data=sub_cbpe,
            x='chunk_chunk_index',
            y='roc_auc_value',
            color=color_est,
            marker='o',
            label='CBPE Estimated',
            ax=ax
        )
        sns.lineplot(
            data=sub_realized,
            x='chunk_chunk_index',
            y='roc_auc_value',
            color=color_real,
            marker='s',
            linestyle='--',
            label='Realized (True)',
            ax=ax
        )
    
        ax.set_title(title, fontsize=12)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("ROC-AUC")
        ax.set_ylim(0.0, 1.01)
        ax.set_yticks(np.arange(0.0, 1.01, 0.10))
        ax.set_xticks(range(10))
        ax.grid(True, alpha=0.3)
        ax.legend(loc='lower right', fontsize=8)
    
    # Plotting the reference CBPE ROC-AUC estimates and realized ROC-AUC computation
    plot_cbpe_vs_realized(
        cbpe_ref, realized_ref, axes[0],
        color_est='blue', color_real='green',
        title=f"{baseline_name} (Reference Period)"
    )
    
    # Plotting the analysis CBPE ROC-AUC estimates and realized ROC-AUC computation
    plot_cbpe_vs_realized(
        cbpe_analysis, realized_analysis, axes[1],
        color_est='orange', color_real='red',
        title=f"{scenario_name} (Analysis Period)"
    )
    
    plt.tight_layout()
    plt.show()
    
    # Creating the deviation summary
    deviation_analysis_summary = pd.merge(
        cbpe_analysis[['chunk_chunk_index', 'roc_auc_value']].rename(columns={'roc_auc_value': 'cbpe_roc_auc'}),
        realized_analysis[['chunk_chunk_index', 'roc_auc_value']].rename(columns={'roc_auc_value': 'realized_roc_auc'}),
        on='chunk_chunk_index',
        how='inner'
    )
    deviation_analysis_summary['roc_auc_diff'] = deviation_analysis_summary['cbpe_roc_auc'] - deviation_analysis_summary['realized_roc_auc']
    deviation_analysis_summary['roc_auc_diff_alert'] = deviation_analysis_summary['roc_auc_diff'].abs() >= diff_threshold
    
    print(f"CBPE vs Realized ROC-AUC Deviation Summary ({scenario_name}):")
    display(deviation_analysis_summary)
    
    return deviation_analysis_summary
In [252]:
##################################
# Visualizing the CBPE and realized performance comparison
# for baseline control
##################################
chunk_realized_performance_analysis_visualization_p1 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p1, baseline_name="Baseline Control", scenario_name="Baseline Control")
CBPE vs Realized ROC-AUC Deviation Summary (Baseline Control):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.9948 0.002318 False
1 1 0.993730 0.9924 0.001330 False
2 2 0.996812 0.9944 0.002412 False
3 3 0.995836 0.9908 0.005036 False
4 4 0.996884 0.9980 -0.001116 False
5 5 0.998574 0.9976 0.000974 False
6 6 0.995688 0.9928 0.002888 False
7 7 0.998026 1.0000 -0.001974 False
8 8 0.995331 0.9980 -0.002669 False
9 9 0.996120 0.9944 0.001720 False

1.9.2 Simulated Covariate Drift¶

Covariate Drift occurs when the distribution of input features changes over time compared to the data used to train the model. Also known as data drift, it does not necessarily imply that the model’s predictive mapping is invalid, but it often precedes performance degradation. Detecting covariate drift requires comparing feature distributions between baseline (reference) data and incoming production data. NannyML provides multiple statistical tests and visualization tools to flag significant changes. Key signatures of covariate drift include shifts in summary statistics, changes in distributional shape, or increased divergence between reference and production feature distributions. These shifts may lead to poor generalization, as the model has not been exposed to the altered feature ranges. Detection techniques include univariate statistical tests (Kolmogorov–Smirnov, Chi-square), multivariate distance measures (Jensen–Shannon divergence, Population Stability Index), and density estimation methods. Remediation approaches involve domain adaptation, re-weighting training samples, or retraining models on updated data distributions. NannyML implements univariate and multivariate tests, provides drift magnitude quantification, and visualizes feature-level changes, allowing practitioners to pinpoint which features are most responsible for the detected drift.
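As a complement to the KS test used throughout this project, the sketch below shows one of the distance measures named above, the Population Stability Index (PSI), computed for a single feature between a reference and a drifted sample. It is a generic illustration with synthetic data and conventional rule-of-thumb cut-offs, not part of NannyML's API.

##################################
# Illustrative sketch: Population Stability Index (PSI)
# for quantifying univariate covariate drift
##################################
import numpy as np

def population_stability_index(reference, production, n_bins=10):
    """Computes the PSI between a reference and a production sample of one feature."""
    # Deriving quantile bin edges from the reference distribution
    edges = np.quantile(reference, np.linspace(0, 1, n_bins + 1))
    edges[0], edges[-1] = -np.inf, np.inf
    ref_counts, _ = np.histogram(reference, bins=edges)
    prod_counts, _ = np.histogram(production, bins=edges)
    # Converting counts to proportions, clipped to avoid division by zero
    eps = 1e-6
    ref_prop = np.clip(ref_counts / len(reference), eps, None)
    prod_prop = np.clip(prod_counts / len(production), eps, None)
    return np.sum((prod_prop - ref_prop) * np.log(prod_prop / ref_prop))

rng = np.random.default_rng(3)
reference_values = rng.normal(14.0, 3.5, 1000)   # baseline feature values
drifted_values = rng.normal(16.0, 5.0, 1000)     # mean- and scale-shifted values
print(f"PSI = {population_stability_index(reference_values, drifted_values):.3f}")
# Common rule of thumb: PSI < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 major shift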

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.
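Direct Loss Estimation, mentioned above, can be illustrated with a minimal sketch: a secondary "nanny" model is fitted on the labeled reference period to predict the primary model's loss from the inputs, and the average predicted loss then serves as a label-free performance estimate on new data. Everything below (data, models, variable names) is a hypothetical stand-in rather than NannyML's DLE implementation.

##################################
# Conceptual sketch of Direct Loss Estimation (DLE)
# for label-free regression performance estimation
##################################
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(4)

# Reference period: features, primary-model predictions, and known targets
X_ref = rng.normal(size=(500, 5))
y_ref = X_ref[:, 0] * 2 + rng.normal(scale=0.5, size=500)
primary_pred_ref = X_ref[:, 0] * 2                       # stand-in for the deployed model's predictions
observed_loss_ref = np.abs(y_ref - primary_pred_ref)     # realized absolute error

# Fitting the nanny model to predict the primary model's loss from the features
nanny_model = GradientBoostingRegressor(random_state=0).fit(X_ref, observed_loss_ref)

# Analysis period: labels unavailable, but features are observed
X_new = rng.normal(loc=0.3, size=(500, 5))                # slightly shifted inputs
estimated_mae = nanny_model.predict(X_new).mean()
print(f"Estimated MAE without labels: {estimated_mae:.3f}")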

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.

  1. A synthetic time-ordered data stream called Covariate Drift was created by selecting features from the combined validation and test data and applying progressive mean and scale shifts across chunks so that their distributions gradually drift. It simulates a realistic scenario in which feature distributions evolve over time, enabling the study of model robustness to changing input characteristics.
  2. The simulated dataset was defined by the following parameters:
    • COVARIATE_DRIFT_FEATURES = lists the 10 selected features whose distributions were intentionally shifted to simulate covariate drift over time
    • COVARIATE_DRIFT_DELTA = additive mean shift magnitude applied to each selected feature to simulate gradual feature value increases or decreases, fixed at 0.5
    • COVARIATE_DRIFT_SCALE = multiplicative scaling factor controlling how much the spread or variance of feature values expands during drift, fixed at 3.5
    • COVARIATE_DRIFT_RAMP = number of chunks over which the covariate drift gradually intensifies from its initial to full effect, fixed at 15
  3. Using selected features evaluated against the baseline control, post-deployment anomaly detection analysis showed:
    • Increasing distributional variability for each drifted feature across chunks relative to the baseline control
    • Increasing average feature values over time, indicating unstable mean trends and deviations relative to the baseline control
    • Balanced class proportions between diagnosis=M and diagnosis=B across chunks, consistent with the baseline control
    • Zero missing rate per feature over time, matching the baseline control
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • Distributional shift alerts were observed for all chunks (0 to 9), with the Kolmogorov–Smirnov (KS) test statistics exceeding their drift thresholds and indicating high variability relative to the baseline control reference.
    • Performance degradation alerts were observed for most chunks (2 to 9), with the CBPE-estimated ROC-AUC trends falling outside the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • Deviation alerts exceeding the defined performance difference threshold were observed for most chunks (1 to 9) when comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
  6. While this study primarily focused on detecting data drift phenomena, practical steps that could be taken to mitigate real-world Covariate Drift include:
    • Regularly retraining the model with recent data samples to realign feature distributions with current conditions.
    • Implementing feature normalization or adaptive reweighting strategies to reduce the impact of shifting input distributions (a density-ratio reweighting sketch follows this list).
    • Monitoring high-impact features for stability and considering feature selection or transformation to improve robustness.
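To illustrate the adaptive reweighting idea referenced in the mitigation list above, the sketch below applies density-ratio importance weighting: a discriminator is trained to distinguish reference rows from production rows, and its odds become sample weights for retraining on the reference data. This is a generic covariate-shift correction sketch with synthetic data, not a step performed in this project or provided by NannyML.

##################################
# Illustrative sketch: density-ratio importance weighting
# as a covariate-shift mitigation strategy
##################################
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(5)
X_reference = rng.normal(loc=0.0, scale=1.0, size=(1000, 3))    # training-time inputs
X_production = rng.normal(loc=0.5, scale=1.5, size=(1000, 3))   # drifted production inputs

# Labeling rows by origin (0 = reference, 1 = production) and fitting a discriminator
X_all = np.vstack([X_reference, X_production])
origin = np.concatenate([np.zeros(len(X_reference)), np.ones(len(X_production))])
discriminator = LogisticRegression(max_iter=1000).fit(X_all, origin)

# Importance weight for each reference row: P(production | x) / P(reference | x)
p_production = discriminator.predict_proba(X_reference)[:, 1]
importance_weights = p_production / np.clip(1 - p_production, 1e-6, None)

# The weights could then be passed to a retraining step, e.g.
# model.fit(X_reference, y_reference, sample_weight=importance_weights)
print(importance_weights[:5])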
In [253]:
##################################
# Defining the covariate drift-specific parameters
# for the post-model deployment scenario simulation
##################################
COVARIATE_DRIFT_FEATURES = ['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean',
                            'compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean']
COVARIATE_DRIFT_DELTA = 0.5
COVARIATE_DRIFT_SCALE = 3.5
COVARIATE_DRIFT_RAMP = 15
In [254]:
##################################
# Defining a function for 
# simulating covariate drift
##################################
def simulate_P2_covariate_drift(df):
    # Creating a time-ordered synthetic stream of data chunks
    stream = make_stream_from_dataframe(df)
    # Computing standard deviations of selected features to scale drift magnitudes appropriately
    stds = df[COVARIATE_DRIFT_FEATURES].std()
    # Looping through each simulated chunk (time step)
    for chunk_idx in range(N_CHUNKS):
        # Computing the progression fraction (0 → 1) of the drift ramp over time
        frac = min(1, (chunk_idx+1)/COVARIATE_DRIFT_RAMP)
        # Applying a Boolean mask to isolate current chunk’s samples
        mask = stream['__chunk'] == chunk_idx
        # Applying drift to each feature selected for covariate drift
        for f in COVARIATE_DRIFT_FEATURES:
            # Applying an additive mean shift proportional to standard deviation and drift fraction
            add = COVARIATE_DRIFT_DELTA * stds[f] * frac
            # Applying a multiplicative scale shift proportional to drift progression
            scale = 1 + (COVARIATE_DRIFT_SCALE - 1) * frac
            # Apply both mean and scale shifts to current chunk’s feature values
            stream.loc[mask, f] = stream.loc[mask, f] * scale + add
    # Returning the modified data stream containing simulated covariate drift
    return stream
In [255]:
##################################
# Defining a function for 
# visualizing the boxplot comparison chart
# for both the simulated and baseline control
##################################
def plot_feature_boxplot_comparison(df_base, df_drift, features, scenario_name):
    """Chunk-based boxplots for selected features for Baseline vs Scenario."""
    # Resetting indices to avoid duplicate label issues
    df_base = df_base.reset_index(drop=True) 
    df_drift = df_drift.reset_index(drop=True)
    # Determining the number of features to plot
    n_features = len(features)
    # Creating a vertically stacked subplot layout (one plot per feature)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    # Ensuring axes is iterable even if there’s only one feature
    if n_features == 1:
        axes = [axes]
    # Iterating through each feature and its corresponding subplot axis
    for ax, f in zip(axes, features):
        # Creating a boxplot showing the distribution of the feature across chunks
        combined_df = pd.concat(
            [df_base.assign(scenario='Baseline Control'), df_drift.assign(scenario=scenario_name)],
            ignore_index=True
        ).dropna(subset=[f, "__chunk"])
        sns.boxplot(
            data=combined_df,
            x="__chunk", y=f, hue="scenario", ax=ax, showfliers=False
        )
        y_min = combined_df[f].min() 
        y_max = combined_df[f].max() 
        y_extension = 0.2 * (y_max - y_min) 
        ax.set_ylim(y_min - y_extension, y_max + y_extension)
        ax.set_title(f"Chunk-wise {f}: {scenario_name} vs Baseline Control")
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel(f)
        ax.legend(loc='upper left', bbox_to_anchor=(0, 1))
        ax.set_xticks(range(10))
    plt.tight_layout()
    plt.show()
    
In [256]:
##################################
# Defining a function for 
# visualizing the mean line comparison chart
# for both the simulated and baseline control
##################################
def plot_feature_mean_line(df_base, df_drift, features, scenario_name):
    """Plots per-feature mean values over chunks (one chart per feature) for Baseline vs Scenario."""
    # Computing the chunk-wise mean per feature for both datasets
    base_means = df_base.groupby('__chunk')[features].mean().assign(scenario='Baseline Control')
    drift_means = df_drift.groupby('__chunk')[features].mean().assign(scenario=scenario_name)
    combined = pd.concat([base_means, drift_means])
    melted = combined.reset_index().melt(
        id_vars=['__chunk', 'scenario'],
        var_name='feature',
        value_name='mean_value'
    )

    # Preparing the subplots (one row per feature)
    n_features = len(features)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]

    # Plotting the lineplots for each feature
    for ax, f in zip(axes, features):
        subset = melted[melted['feature'] == f]
        sns.lineplot(
            data=subset,
            x='__chunk',
            y='mean_value',
            hue='scenario',
            ax=ax
        )
        ax.set_title(f"Chunk-wise Mean of {f}: {scenario_name} vs Baseline", fontsize=11)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("Mean Value")
        ax.grid(True, alpha=0.3)
        ax.set_xticks(range(10))
        ax.legend(loc='best')

    plt.tight_layout()
    plt.show()
In [257]:
##################################
# Defining a function for 
# visualizing the boxplot comparison chart
# by target label
# for both the simulated and baseline control
##################################
def plot_feature_target_boxplot_comparison(df_base, df_drift, features, scenario_name, target_col="diagnosis", jitter_points=True):
    """
    For each feature, creates two boxplots (reference vs scenario) by chunk index,
    grouped by target class ('M' and 'B').
    """
    # Resetting indices to avoid duplicate label issues
    df_base = df_base.reset_index(drop=True)
    df_drift = df_drift.reset_index(drop=True)
    # Determining the number of features to plot    
    n_features = len(features)
    
    # Creating a grid of subplots (one row per feature, with baseline and scenario columns side by side)
    fig, axes = plt.subplots(
        nrows=n_features, ncols=2, figsize=(18, 4 * n_features), sharex=False, sharey='row'
    )
    if n_features == 1:
        axes = [axes]  # ensure iterable
    
    # Iterating through features and their corresponding axis pairs
    for feature, ax_pair in zip(features, axes):
        ax_ref, ax_scen = ax_pair
        
        # Dropping NaN values for the current feature
        df_base_f = df_base.dropna(subset=[feature, "__chunk", target_col])
        df_drift_f = df_drift.dropna(subset=[feature, "__chunk", target_col])
        
        # Plotting the reference boxplots
        sns.boxplot(
            data=df_base_f,
            x="__chunk", y=feature, hue=target_col, hue_order=['M', 'B'],
            palette={"M": "#1f77b4", "B": "#aec7e8"},
            showfliers=False, ax=ax_ref
        )

        if jitter_points:
            sns.stripplot(
                data=df_base_f,
                x="__chunk", y=feature, hue=target_col,
                hue_order=['M', 'B'],
                palette={"M": "#ff0000", "B": "#000000"},
                dodge=True, jitter=0.15, alpha=0.95, size=3, linewidth=0,
                ax=ax_ref
            )
            
        ax_ref.set_title(f"{feature} — Baseline Control")
        ax_ref.set_xlabel("Chunk Index (Simulated Time)")
        ax_ref.set_ylabel(feature)
        ax_ref.legend(title="Diagnosis", loc="upper left", bbox_to_anchor=(0, 1))
        
        # Plot scenario boxplots
        sns.boxplot(
            data=df_drift_f,
            x="__chunk", y=feature, hue=target_col, hue_order=['M', 'B'],
            palette={"M": "#ff7f0e", "B": "#ffbb78"},
            showfliers=False, ax=ax_scen
        )

        if jitter_points:
            sns.stripplot(
                data=df_drift_f,
                x="__chunk", y=feature, hue=target_col,
                hue_order=['M', 'B'],
                palette={"M": "#ff0000", "B": "#000000"},
                dodge=True, jitter=0.15, alpha=0.95, size=3, linewidth=0,
                ax=ax_scen
            )
        
        ax_scen.set_title(f"{feature} — {scenario_name}")
        ax_scen.set_xlabel("Chunk Index (Simulated Time)")
        ax_scen.set_ylabel(feature)
        ax_scen.legend(title="Diagnosis", loc="upper left", bbox_to_anchor=(0, 1))
        
        # Adjusting the Y-axis limits for comparability across the row
        y_min = min(df_base_f[feature].min(), df_drift_f[feature].min())
        y_max = max(df_base_f[feature].max(), df_drift_f[feature].max())
        y_ext = 0.2 * (y_max - y_min)
        ax_ref.set_ylim(y_min - y_ext, y_max + y_ext)
        ax_scen.set_ylim(y_min - y_ext, y_max + y_ext)

    plt.tight_layout()
    plt.show()
In [258]:
##################################
# Simulating post-deployment data drift scenario 2 = covariate drift
##################################
p2 = simulate_P2_covariate_drift(breast_cancer_monitoring_baseline)
In [259]:
##################################
# Exploring the simulated covariate drift
##################################
display(p2)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 M 25.445009 20.258748 165.185875 1815.234056 0.109976 0.101762 0.138978 0.100065 0.201297 ... 199.50 3143.0 0.1363 0.16280 0.28610 0.18200 0.2510 0.06494 0 0
1 B 14.408342 26.313748 92.012542 555.817389 0.096069 0.062539 0.022708 0.016018 0.181114 ... 92.74 622.9 0.1256 0.18040 0.12300 0.06335 0.3100 0.08203 0 0
2 B 12.541676 29.557082 80.147542 416.400723 0.113161 0.086269 0.030466 0.020148 0.222297 ... 77.98 455.7 0.1499 0.13980 0.11250 0.06136 0.3409 0.08147 0 0
3 M 28.408342 23.700415 194.702542 2066.067389 0.169313 0.336355 0.500645 0.236040 0.310731 ... 180.90 2073.0 0.1696 0.42440 0.58030 0.22480 0.3222 0.08009 0 0
4 B 12.833342 15.253748 80.940875 439.500723 0.088171 0.045249 0.006316 0.008992 0.169214 ... 78.07 470.0 0.1171 0.08294 0.01854 0.03953 0.2738 0.07685 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 B 39.913424 52.910816 259.358751 1874.873893 0.228639 0.226722 0.262232 0.091062 0.402606 ... 108.10 830.5 0.1089 0.26490 0.37790 0.09594 0.2471 0.07463 9 9
996 M 49.993424 56.217483 330.158751 2921.007226 0.289759 0.351522 0.445512 0.265089 0.505806 ... 142.20 1493.0 0.1492 0.25360 0.37590 0.15100 0.3074 0.07863 9 9
997 M 39.006758 64.830816 255.678751 1744.207226 0.257305 0.366988 0.324445 0.185382 0.605806 ... 115.00 811.3 0.1559 0.40590 0.37440 0.17720 0.4724 0.10260 9 9
998 M 41.486758 45.817483 271.438751 2026.607226 0.241652 0.274402 0.228445 0.121836 0.434872 ... 117.70 989.5 0.1491 0.33310 0.33270 0.12520 0.3415 0.09740 9 9
999 B 29.433424 51.870816 192.772084 1039.407226 0.263305 0.324588 0.197432 0.083516 0.522339 ... 78.28 424.8 0.1213 0.25150 0.19160 0.07926 0.2940 0.07587 9 9

1000 rows × 33 columns

In [260]:
##################################
# Visualizing baseline feature variability
# for the simulated covariate drift scenario
# and baseline control
##################################
plot_feature_boxplot_comparison(p1, p2, COVARIATE_DRIFT_FEATURES, "Covariate Drift")
[Figure: chunk-wise boxplots of the drifted features, Covariate Drift vs Baseline Control]
In [261]:
##################################
# Visualizing baseline feature variability
# for the simulated covariate drift scenario
# and baseline control
##################################
plot_feature_mean_line(p1, p2, COVARIATE_DRIFT_FEATURES, "Covariate Drift")
[Figure: chunk-wise feature means, Covariate Drift vs Baseline Control]
In [262]:
##################################
# Inspecting class distribution
# for the simulated covariate drift scenario
# and baseline control
##################################
for feat in COVARIATE_DRIFT_FEATURES:
    fig, ax = plt.subplots(1, 2, figsize=(14, 3), sharey=True)
    combined_min = min(p1[feat].min(), p2[feat].min()) 
    combined_max = max(p1[feat].max(), p2[feat].max()) 
    y_margin = 0.05 * (combined_max - combined_min)
    y_min, y_max = combined_min - y_margin, combined_max + y_margin
    sns.boxplot(x="diagnosis", y=feat, data=p1, ax=ax[0], hue="diagnosis", order=['M', 'B'], palette={"M": "#1f77b4", "B": "#aec7e8"})
    ax[0].set_title(f"{feat} by Label - Baseline Control")
    ax[0].set_ylim(y_min, y_max)
    sns.boxplot(x="diagnosis", y=feat, data=p2, ax=ax[1], hue="diagnosis", order=['M', 'B'], palette={"M": "#ff7f0e", "B": "#ffbb78"})
    ax[1].set_title(f"{feat} by Label - Covariate Drift")
    ax[1].set_ylim(y_min, y_max)
    plt.show() 

    
[Figures: per-feature boxplots by diagnosis label, Baseline Control vs Covariate Drift]
In [263]:
##################################
# Visualizing baseline feature variability
# by target label
# for the simulated covariate drift scenario
# and baseline control
##################################
plot_feature_target_boxplot_comparison(p1, p2, COVARIATE_DRIFT_FEATURES, "Covariate Drift")
[Figure: chunk-wise boxplots by diagnosis label, Baseline Control vs Covariate Drift]
In [264]:
##################################
# Defining a function for 
# plotting class proportion ('M' vs 'B') across chunks
# for both the simulated and baseline control
##################################
def plot_class_proportion(df_base, df_shift, scenario_name):
    def prop(df):
        return df.groupby('__chunk')['diagnosis'].value_counts(normalize=True).unstack().fillna(0)
    base_prop = prop(df_base)
    shift_prop = prop(df_shift)
    fig, ax = plt.subplots(figsize=(14, 3))
    sns.lineplot(data=base_prop['M'], label='Baseline M', ax=ax)
    sns.lineplot(data=shift_prop['M'], label=f'{scenario_name} M', ax=ax)
    ax.set_title(f"Proportion of Malignant (M) per Chunk: {scenario_name} vs Baseline Control")
    ax.set_xlabel("Chunk Index (Simulated Time)")
    ax.set_ylabel("Proportion of 'M'")
    ax.set_ylim(-0.1, 1)
    ax.set_xticks(range(10))
    ax.legend()
    plt.show()
    
In [265]:
##################################
# Inspecting class balance stability
# for the simulated covariate drift scenario
# and baseline control
##################################
plot_class_proportion(p1, p2, "Covariate Drift")
[Figure: proportion of malignant (M) cases per chunk, Covariate Drift vs Baseline Control]
In [266]:
##################################
# Defining a function for 
# plotting missing fraction per chunk
# for both the simulated and baseline control
##################################
def plot_missingness_spike(df_base, df_shift, features, scenario_name):
    # Computing the missing fraction per chunk
    def missing_rate(df):
        return df.groupby('__chunk')[features].apply(lambda x: x.isna().mean())

    # Computing missingness for baseline and simulated datasets
    miss_base = missing_rate(df_base)
    miss_sim = missing_rate(df_shift)

    # Creating a subplot per feature
    n_features = len(features)
    fig, axes = plt.subplots(n_features, 1, figsize=(12, 3 * n_features), sharex=True)
    if n_features == 1:
        axes = [axes]

    # Looping through features and plot both Baseline and Scenario
    for ax, f in zip(axes, features):
        # Plotting baseline missingness
        sns.lineplot(x=miss_base.index, y=miss_base[f], color="#4C72B0", label="Baseline Control", ax=ax)
        # Plotting simulated scenario missingness
        sns.lineplot(x=miss_sim.index, y=miss_sim[f], color="#DD8452", label=scenario_name, ax=ax)

        ax.set_title(f"Missingness Spike over Time: {f} ({scenario_name} vs Baseline Control)", fontsize=11)
        ax.set_xlabel("Chunk Index (Simulated Time)")
        ax.set_ylabel("Missing Rate")
        ax.set_ylim(-0.1, 1)
        ax.set_xticks(range(10))
        ax.grid(True, alpha=0.3)
        ax.legend(loc="best")

    plt.tight_layout()
    plt.show()
    
In [267]:
##################################
# Evaluating missingness spike
# of the simulated covariate drift scenario
# and the baseline control
##################################
plot_missingness_spike(p1, p2, COVARIATE_DRIFT_FEATURES, "Covariate Drift")
[Figure: missing rate per chunk for the drifted features, Covariate Drift vs Baseline Control]
In [268]:
##################################
# Detecting univariate drift for covariate drift
##################################
univariate_drift_analysis_p2 = detect_univariate_drift(p1, p2, FEATURE_COLUMNS, "Covariate Drift")
Univariate drift visualization generated for Covariate Drift
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.193        0.101506            None  ...               None   
1              0.326        0.101506            None  ...               None   
2              0.398        0.101506            None  ...               None   
3              0.502        0.101506            None  ...               None   
4              0.515        0.101506            None  ...               None   
5              0.632        0.101506            None  ...               None   
6              0.655        0.101506            None  ...               None   
7              0.605        0.101506            None  ...               None   
8              0.708        0.101506            None  ...               None   
9              0.704        0.101506            None  ...               None   

                texture_se                                         \
        kolmogorov_smirnov                                          
  alert              value upper_threshold lower_threshold  alert   
0  True              0.056        0.144826            None  False   
1  True              0.087        0.144826            None  False   
2  True              0.049        0.144826            None  False   
3  True              0.071        0.144826            None  False   
4  True              0.099        0.144826            None  False   
5  True              0.106        0.144826            None  False   
6  True              0.095        0.144826            None  False   
7  True              0.066        0.144826            None  False   
8  True              0.059        0.144826            None  False   
9  True              0.113        0.144826            None  False   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0              0.047        0.143381            None  False  
1              0.091        0.143381            None  False  
2              0.054        0.143381            None  False  
3              0.087        0.143381            None  False  
4              0.090        0.143381            None  False  
5              0.099        0.143381            None  False  
6              0.071        0.143381            None  False  
7              0.085        0.143381            None  False  
8              0.120        0.143381            None  False  
9              0.064        0.143381            None  False  

[10 rows x 127 columns]
In [269]:
##################################
# Visualizing univariate drift for covariate drift
##################################
univariate_drift_analysis_visualization_p2 = plot_univariate_drift_summary(univariate_drift_analysis_p2, FEATURE_COLUMNS, "Covariate Drift")
[Figure: univariate KS drift summary, Covariate Drift]
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 10
1 texture_mean 10
2 perimeter_mean 10
3 area_mean 10
4 smoothness_mean 10
5 compactness_mean 10
6 concavity_mean 10
7 concave points_mean 10
8 symmetry_mean 10
9 fractal_dimension_mean 10
10 radius_se 0
11 texture_se 0
12 perimeter_se 0
13 area_se 0
14 smoothness_se 0
15 compactness_se 0
16 concavity_se 0
17 concave points_se 0
18 symmetry_se 0
19 fractal_dimension_se 0
20 radius_worst 0
21 texture_worst 0
22 perimeter_worst 0
23 area_worst 0
24 smoothness_worst 0
25 compactness_worst 0
26 concavity_worst 0
27 concave points_worst 0
28 symmetry_worst 0
29 fractal_dimension_worst 0
In [270]:
##################################
# Estimating CBPE performance for covariate drift
##################################
chunk_cbpe_performance_analysis_p2 = estimate_chunk_cbpe_performance(p1, p2, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.000000 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.000000 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.000000 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.000000 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.000000 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.000000 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.000000 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.000000 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.000000 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.000000 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.995267 0.003343 NaN 1.000000 0.985239 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.982532 0.003343 NaN 0.992560 0.972504 1 0.986902 True
12 [200:299] 2 200 299 None None analysis 0.976647 0.003343 NaN 0.986675 0.966620 1 0.986902 True
13 [300:399] 3 300 399 None None analysis 0.982161 0.003343 NaN 0.992189 0.972133 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.969068 0.003343 NaN 0.979096 0.959040 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.971888 0.003343 NaN 0.981916 0.961860 1 0.986902 True
16 [600:699] 6 600 699 None None analysis 0.912067 0.003343 NaN 0.922095 0.902039 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.926232 0.003343 NaN 0.936260 0.916204 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.922562 0.003343 NaN 0.932590 0.912534 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.874222 0.003343 NaN 0.884250 0.864194 1 0.986902 True
In [271]:
##################################
# Visualizing CBPE performance for covariate drift
##################################
chunk_cbpe_performance_analysis_visualization_p2 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p2, baseline_name="Baseline Control", scenario_name="Covariate Drift")
[Figure: CBPE-estimated ROC-AUC per chunk, Covariate Drift vs Baseline Control]
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 1
3 1 reference 0
4 2 analysis 1
5 2 reference 0
6 3 analysis 1
7 3 reference 0
8 4 analysis 1
9 4 reference 0
10 5 analysis 1
11 5 reference 0
12 6 analysis 1
13 6 reference 0
14 7 analysis 1
15 7 reference 0
16 8 analysis 1
17 8 reference 0
18 9 analysis 1
19 9 reference 0
In [272]:
##################################
# Calculating realized performance for covariate drift
##################################
chunk_realized_performance_analysis_p2 = calculate_chunk_realized_performance(p1, p2, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.9948 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.9924 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.9944 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.9908 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.9980 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.9976 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.9928 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.0000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.9980 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.9944 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.9948 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.9732 1 0.986902 True
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.9480 1 0.986902 True
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.8340 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.7700 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.7472 1 0.986902 True
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.7292 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 0.7180 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 0.6148 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 0.7076 1 0.986902 True
In [273]:
##################################
# Visualizing the CBPE and realized performance comparison
# for covariate drift
##################################
chunk_realized_performance_analysis_visualization_p2 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p2, baseline_name="Baseline Control", scenario_name="Covariate Drift")
[Figure: CBPE-estimated vs realized ROC-AUC per chunk, Covariate Drift]
CBPE vs Realized ROC-AUC Deviation Summary (Covariate Drift):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.9948 0.002318 False
1 1 0.993730 0.9732 0.020530 False
2 2 0.996812 0.9480 0.048812 False
3 3 0.995836 0.8340 0.161836 True
4 4 0.996884 0.7700 0.226884 True
5 5 0.998574 0.7472 0.251374 True
6 6 0.995688 0.7292 0.266488 True
7 7 0.998026 0.7180 0.280026 True
8 8 0.995331 0.6148 0.380531 True
9 9 0.996120 0.7076 0.288520 True

1.9.3 Simulated Prior Shift¶

Prior Shift arises when the distribution of the target variable changes, while the conditional relationship between features and labels remains stable. This is also referred to as label shift. Models trained on the original distribution may underperform because their predictions no longer match the new class priors. Detecting prior shifts is crucial, especially in imbalanced classification tasks where small changes in priors can lead to large performance impacts. Prior shift is typically characterized by systematic increases or decreases in class frequencies without corresponding changes in feature distributions. Its impact includes skewed decision thresholds, inflated false positives or false negatives, and degraded calibration of predicted probabilities. Detection approaches include monitoring predicted class proportions, estimating priors using EM-based algorithms, and re-weighting predictions to align with new distributions. Correction strategies may involve resampling, threshold adjustment, or cost-sensitive learning. NannyML assists by tracking predicted probability distributions and comparing them against reference priors, using techniques such as Jensen–Shannon divergence and Population Stability Index to quantify the magnitude of shift.
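
One way to make the magnitude of such a shift concrete is the Population Stability Index mentioned above. The short sketch below is a standalone illustration rather than part of this notebook's monitoring pipeline: it compares two hypothetical class-proportion vectors, and the baseline and shifted priors are assumed values chosen only for demonstration.

import numpy as np

def population_stability_index(expected, actual, eps=1e-6):
    # Clipping to avoid division by zero or log of zero, then renormalizing both distributions
    expected = np.clip(np.asarray(expected, dtype=float), eps, None)
    actual = np.clip(np.asarray(actual, dtype=float), eps, None)
    expected = expected / expected.sum()
    actual = actual / actual.sum()
    # PSI = sum over classes of (actual - expected) * ln(actual / expected)
    return float(np.sum((actual - expected) * np.log(actual / expected)))

# Hypothetical priors: roughly 37% malignant at baseline versus 70% malignant after a prior shift
print(population_stability_index([0.37, 0.63], [0.70, 0.30]))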

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.
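
As a standalone illustration of the chunk-wise KS computation described above (separate from the detect_univariate_drift helper used in this notebook, which relies on NannyML's own thresholds), the sketch below compares one feature of each analysis chunk against the baseline sample with scipy.stats.ks_2samp and flags chunks using a p-value cutoff; the function name and the p-value-based alert rule are illustrative assumptions.

import pandas as pd
from scipy.stats import ks_2samp

def ks_per_chunk(df_reference, df_analysis, feature, chunk_col="__chunk", alpha=0.05):
    # Pooling the baseline values of the feature as the reference sample
    reference_values = df_reference[feature].dropna()
    rows = []
    # Computing the two-sample KS statistic of each analysis chunk against the baseline
    for chunk_id, chunk in df_analysis.groupby(chunk_col):
        statistic, p_value = ks_2samp(reference_values, chunk[feature].dropna())
        rows.append({"chunk": chunk_id, "ks_statistic": statistic,
                     "p_value": p_value, "alert": p_value < alpha})
    return pd.DataFrame(rows)

# Example usage (assuming the p1 baseline and p2 covariate-drift frames from this notebook):
# display(ks_per_chunk(p1, p2, "radius_mean"))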

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.
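
For completeness, a minimal sketch of chunk-wise labeled evaluation is given below; it is not the calculate_chunk_realized_performance wrapper used later in this notebook, and it assumes that the malignant class maps to the positive (index 1) probability column of the fitted classifier.

import pandas as pd
from sklearn.metrics import roc_auc_score

def realized_roc_auc_per_chunk(model, df_analysis, feature_columns,
                               target_col="diagnosis", positive_label="M", chunk_col="__chunk"):
    rows = []
    for chunk_id, chunk in df_analysis.groupby(chunk_col):
        # Encoding the ground-truth labels and scoring the chunk with the fitted classifier
        y_true = (chunk[target_col] == positive_label).astype(int)
        y_score = model.predict_proba(chunk[feature_columns])[:, 1]
        # ROC-AUC is undefined when a chunk contains only one class
        if y_true.nunique() < 2:
            continue
        rows.append({"chunk": chunk_id, "realized_roc_auc": roc_auc_score(y_true, y_score)})
    return pd.DataFrame(rows)

# Example usage (assuming boosted_cb_optimal, p3, and FEATURE_COLUMNS from this notebook):
# display(realized_roc_auc_per_chunk(boosted_cb_optimal, p3, FEATURE_COLUMNS))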

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.
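
A minimal sketch of how a CBPE estimator can be configured directly with the NannyML API is shown below; the column names, the 0.5 decision threshold, and the chunk size are illustrative assumptions, and the estimate_chunk_cbpe_performance wrapper used elsewhere in this notebook may differ in its exact setup.

import nannyml as nml

def attach_prediction_columns(model, df, feature_columns, target_col="diagnosis", positive_label="M"):
    # Attaching predicted probabilities, hard predictions, and encoded targets as columns
    frame = df.copy()
    frame["y_pred_proba"] = model.predict_proba(frame[feature_columns])[:, 1]
    frame["y_pred"] = (frame["y_pred_proba"] >= 0.5).astype(int)
    frame["y_true"] = (frame[target_col] == positive_label).astype(int)
    return frame

# Assuming boosted_cb_optimal, p1 (baseline), p3 (prior shift), and FEATURE_COLUMNS from this notebook
reference_frame = attach_prediction_columns(boosted_cb_optimal, p1, FEATURE_COLUMNS)
analysis_frame = attach_prediction_columns(boosted_cb_optimal, p3, FEATURE_COLUMNS)

# Fitting the confidence-to-performance mapping on the labeled reference period,
# then estimating ROC-AUC on the unlabeled analysis data
cbpe_estimator = nml.CBPE(
    y_pred_proba="y_pred_proba",
    y_pred="y_pred",
    y_true="y_true",
    problem_type="classification_binary",
    metrics=["roc_auc"],
    chunk_size=100,
)
cbpe_estimator.fit(reference_frame)
cbpe_results = cbpe_estimator.estimate(analysis_frame)
display(cbpe_results.to_df())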

  1. A synthetic time-ordered data stream called Prior Shift was created by progressively altering the class balance (ratio of malignant to benign samples) across sequential data chunks. It uses fixed random seeds per chunk to reproducibly sample and shuffle data while gradually increasing the proportion of positive (malignant) cases according to a defined drift ramp.
  2. The simulated dataset was defined by the following parameters:
    • PRIOR_SHIFT_START_P = initial proportion of positive (malignant) cases at the start of the simulated stream, fixed at 0.00
    • PRIOR_SHIFT_END_P = final proportion of positive (malignant) cases reached by the end of the simulation, fixed at 0.95
    • PRIOR_SHIFT_RAMP = number of chunks over which the class proportion gradually transitions from the start to the end value, controlling the pace of the prior shift, fixed at 10
  3. Using all features evaluated against the baseline control, post-deployment anomaly detection analysis showed:
    • Increasing distributional variability in each feature across chunks relative to the baseline control
    • Increasing average feature values over time, indicating unstable mean trends and deviations relative to the baseline control
    • Increasing and decreasing class proportions for diagnosis=M and diagnosis=B respectively across chunks over time against the baseline control
    • Zero missing rate per feature over time against the baseline control
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • Distributional shift alerts exceeding the drift threshold were observed for the earlier chunks (0 to 2) and later chunks (6 to 9) using the Kolmogorov–Smirnov (KS) test statistic, indicating high variability compared to the baseline control reference.
    • A performance degradation alert was observed for the last chunk (9) based on the CBPE-estimated ROC-AUC trend relative to the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • No deviation alerts exceeding the defined performance-difference threshold were observed for any chunk when comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
  6. Although the emphasis of this study was on identifying drift rather than correction, possible interventions for a Prior Shift scenario are as follows:
    • Applying techniques such as prior probability adjustment or rebalancing sample weights to account for changing class proportions (a minimal sketch of the probability adjustment is shown after this list).
    • Continuously recalibrating the model’s output probabilities to maintain prediction accuracy under changing prevalence rates.
    • Introducing active learning or periodic labeling strategies to update the model on evolving class distributions.
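
The probability adjustment referenced in the first intervention above can be sketched as follows: calibrated positive-class probabilities are re-weighted from the training prior to the new deployment prior and renormalized. The priors in the example are hypothetical values, not estimates taken from this dataset.

import numpy as np

def adjust_probabilities_for_new_prior(p_positive, prior_train, prior_new):
    # Re-weighting each calibrated probability by the ratio of new to old class priors, then renormalizing
    p_positive = np.asarray(p_positive, dtype=float)
    positive_weight = prior_new / prior_train
    negative_weight = (1 - prior_new) / (1 - prior_train)
    return p_positive * positive_weight / (p_positive * positive_weight + (1 - p_positive) * negative_weight)

# Hypothetical example: probabilities calibrated at a 37% positive prior, deployed where the prior is 70%
print(adjust_probabilities_for_new_prior([0.2, 0.5, 0.8], prior_train=0.37, prior_new=0.70))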
In [274]:
##################################
# Defining the prior-shift parameters
# for the post-model deployment scenario simulation
##################################
PRIOR_SHIFT_START_P = 0.00
PRIOR_SHIFT_END_P = 0.95
PRIOR_SHIFT_RAMP = 10
In [275]:
##################################
# Defining a function for 
# simulating prior shift
##################################
def simulate_P3_prior_shift(df, n_chunks=N_CHUNKS, chunk_size=CHUNK_SIZE, chunk_seeds=CHUNK_SEEDS):
    # Separating the dataset into positive (M) and negative (B) subsets
    df_pos = df[df[TARGET_COL].map(LABEL_MAP)==1]
    df_neg = df[df[TARGET_COL].map(LABEL_MAP)==0]
    # Creating an empty list to collect chunked DataFrames
    chunks = []
    # Iterating over each simulated monitoring chunk
    for c, seed in enumerate(chunk_seeds[:n_chunks]):
        # Initializing a random number generator by chunk for reproducibility
        rng = np.random.RandomState(seed)
        # Calculating the current progression fraction (0 → 1)
        frac = min(1, (c+1)/PRIOR_SHIFT_RAMP)
        # Gradually changing the class prevalence (probability of positives)
        p = PRIOR_SHIFT_START_P + (PRIOR_SHIFT_END_P - PRIOR_SHIFT_START_P) * frac
        # Determining the number of positive and negative samples for the current chunk
        n_pos = int(chunk_size * p)
        n_neg = chunk_size - n_pos
        # Sampling from positive and negative pools with replacement
        pos = df_pos.sample(n=n_pos, replace=True, random_state=rng)
        neg = df_neg.sample(n=n_neg, replace=True, random_state=rng)
        # Combining and shuffling the sampled data to avoid order bias
        chunk = pd.concat([pos, neg]).sample(frac=1, random_state=rng)
        # Assigning synthetic time and chunk identifiers
        chunk['__chunk'] = c
        chunk['__timestamp'] = c
        # Storing the chunk in the list
        chunks.append(chunk)
    # Concatenating all chunks into a single DataFrame for analysis    
    return pd.concat(chunks, ignore_index=True)
In [276]:
##################################
# Simulating post-deployment data drift scenario 3 = prior shift
##################################
p3 = simulate_P3_prior_shift(breast_cancer_monitoring_baseline)
In [277]:
##################################
# Exploring the simulated prior shift
##################################
display(p3)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 B 10.260 16.58 65.85 320.8 0.08877 0.08066 0.043580 0.024380 0.1669 ... 71.08 357.4 0.1461 0.22460 0.178300 0.08333 0.2691 0.09479 0 0
1 M 24.250 20.20 166.20 1761.0 0.14470 0.28670 0.426800 0.201200 0.2655 ... 180.90 2073.0 0.1696 0.42440 0.580300 0.22480 0.3222 0.08009 0 0
2 B 12.870 16.21 82.38 512.2 0.09425 0.06219 0.039000 0.016150 0.2010 ... 89.27 597.5 0.1256 0.18080 0.199200 0.05780 0.3604 0.07062 0 0
3 B 11.250 14.78 71.38 390.0 0.08306 0.04458 0.000974 0.002941 0.1773 ... 82.08 492.7 0.1166 0.09794 0.005518 0.01667 0.2815 0.07418 0 0
4 B 8.671 14.45 54.42 227.2 0.09138 0.04276 0.000000 0.000000 0.1722 ... 58.36 259.2 0.1162 0.07057 0.000000 0.00000 0.2592 0.07848 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 M 18.310 20.58 120.80 1052.0 0.10680 0.12480 0.156900 0.094510 0.1860 ... 142.20 1493.0 0.1492 0.25360 0.375900 0.15100 0.3074 0.07863 9 9
996 B 11.290 13.04 72.23 388.0 0.09834 0.07608 0.032650 0.027550 0.1769 ... 78.27 457.5 0.1358 0.15070 0.127500 0.08750 0.2733 0.08022 9 9
997 M 13.820 24.49 92.33 595.9 0.11620 0.16810 0.135700 0.067590 0.2275 ... 106.00 788.0 0.1794 0.39660 0.338100 0.15210 0.3651 0.11830 9 9
998 M 19.790 25.12 130.40 1192.0 0.10150 0.15890 0.254500 0.114900 0.2202 ... 148.70 1589.0 0.1275 0.38610 0.567300 0.17320 0.3305 0.08465 9 9
999 M 19.400 23.50 129.10 1155.0 0.10270 0.15580 0.204900 0.088860 0.1978 ... 144.90 1417.0 0.1463 0.29680 0.345800 0.15640 0.2920 0.07614 9 9

1000 rows × 33 columns

In [278]:
##################################
# Visualizing baseline feature variability
# for the simulated prior shift scenario
# and baseline control
##################################
plot_feature_boxplot_comparison(p1, p3, FEATURE_COLUMNS, "Prior Shift")
[Figure: chunk-wise boxplots of all features, Prior Shift vs Baseline Control]
In [279]:
##################################
# Visualizing baseline feature variability
# for the simulated prior shift scenario
# and baseline control
##################################
plot_feature_mean_line(p1, p3, FEATURE_COLUMNS, "Prior Shift")
[Figure: chunk-wise feature means, Prior Shift vs Baseline Control]
In [280]:
##################################
# Inspecting class distribution
# for the simulated prior shift scenario
# and baseline control
##################################
for feat in FEATURE_COLUMNS:
    fig, ax = plt.subplots(1, 2, figsize=(14, 3), sharey=True)
    combined_min = min(p1[feat].min(), p3[feat].min()) 
    combined_max = max(p1[feat].max(), p3[feat].max()) 
    y_margin = 0.05 * (combined_max - combined_min)
    y_min, y_max = combined_min - y_margin, combined_max + y_margin
    sns.boxplot(x="diagnosis", y=feat, data=p1, ax=ax[0], hue="diagnosis", order=['M', 'B'], palette={"M": "#1f77b4", "B": "#aec7e8"})
    ax[0].set_title(f"{feat} by Label - Baseline Control")
    ax[0].set_ylim(y_min, y_max)
    sns.boxplot(x="diagnosis", y=feat, data=p3, ax=ax[1], hue="diagnosis", order=['M', 'B'], palette={"M": "#ff7f0e", "B": "#ffbb78"})
    ax[1].set_title(f"{feat} by Label - Prior Shift")
    ax[1].set_ylim(y_min, y_max)
    plt.show()
    
[Figures: per-feature boxplots by diagnosis label, Baseline Control vs Prior Shift]
In [281]:
##################################
# Visualizing baseline feature variability
# by target label
# for the simulated prior shift scenario
# and baseline control
##################################
plot_feature_target_boxplot_comparison(p1, p3, FEATURE_COLUMNS, "Prior Shift")
[Figure: chunk-wise boxplots by diagnosis label, Baseline Control vs Prior Shift]
In [282]:
##################################
# Inspecting class balance stability
# for the simulated prior shift scenario
# and baseline control
##################################
plot_class_proportion(p1, p3, "Prior Shift")
[Figure: proportion of malignant (M) cases per chunk, Prior Shift vs Baseline Control]
In [283]:
##################################
# Evaluating missingness spike
# of the simulated prior shift scenario
# and the baseline control
##################################
plot_missingness_spike(p1, p3, FEATURE_COLUMNS, "Prior Shift")
[Figure: missing rate per chunk, Prior Shift vs Baseline Control]
In [284]:
##################################
# Detecting univariate drift for prior shift
##################################
univariate_drift_analysis_p3 = detect_univariate_drift(p1, p3, FEATURE_COLUMNS, "Prior Shift")
Univariate drift visualization generated for Prior Shift
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.313        0.101506            None  ...               None   
1              0.231        0.101506            None  ...               None   
2              0.199        0.101506            None  ...               None   
3              0.086        0.101506            None  ...               None   
4              0.060        0.101506            None  ...               None   
5              0.067        0.101506            None  ...               None   
6              0.145        0.101506            None  ...               None   
7              0.252        0.101506            None  ...               None   
8              0.290        0.101506            None  ...               None   
9              0.344        0.101506            None  ...               None   

                 texture_se                                         \
         kolmogorov_smirnov                                          
   alert              value upper_threshold lower_threshold  alert   
0   True              0.103        0.144826            None  False   
1   True              0.118        0.144826            None  False   
2   True              0.087        0.144826            None  False   
3  False              0.091        0.144826            None  False   
4  False              0.099        0.144826            None  False   
5  False              0.104        0.144826            None  False   
6   True              0.072        0.144826            None  False   
7   True              0.074        0.144826            None  False   
8   True              0.065        0.144826            None  False   
9   True              0.158        0.144826            None   True   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0              0.285        0.143381            None   True  
1              0.161        0.143381            None   True  
2              0.114        0.143381            None  False  
3              0.150        0.143381            None   True  
4              0.100        0.143381            None  False  
5              0.060        0.143381            None  False  
6              0.113        0.143381            None  False  
7              0.191        0.143381            None   True  
8              0.250        0.143381            None   True  
9              0.271        0.143381            None   True  

[10 rows x 127 columns]
In [285]:
##################################
# Visualizing univariate drift for prior shift
##################################
univariate_drift_analysis_visualization_p3 = plot_univariate_drift_summary(univariate_drift_analysis_p3, FEATURE_COLUMNS, "Prior Shift")
[Figure: univariate KS drift summary, Prior Shift]
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 7
1 texture_mean 7
2 perimeter_mean 7
3 area_mean 7
4 smoothness_mean 4
5 compactness_mean 7
6 concavity_mean 8
7 concave points_mean 7
8 symmetry_mean 0
9 fractal_dimension_mean 0
10 radius_se 7
11 texture_se 1
12 perimeter_se 7
13 area_se 7
14 smoothness_se 0
15 compactness_se 4
16 concavity_se 4
17 concave points_se 4
18 symmetry_se 0
19 fractal_dimension_se 3
20 radius_worst 8
21 texture_worst 6
22 perimeter_worst 7
23 area_worst 9
24 smoothness_worst 4
25 compactness_worst 7
26 concavity_worst 8
27 concave points_worst 8
28 symmetry_worst 0
29 fractal_dimension_worst 5
In [286]:
##################################
# Estimating CBPE performance for prior shift
##################################
chunk_cbpe_performance_analysis_p3 = estimate_chunk_cbpe_performance(p1, p3, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.000000 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.000000 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.000000 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.000000 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.000000 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.000000 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.000000 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.000000 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.000000 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.000000 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.990508 0.003343 NaN 1.000000 0.980481 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.992513 0.003343 NaN 1.000000 0.982485 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.993716 0.003343 NaN 1.000000 0.983688 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.995622 0.003343 NaN 1.000000 0.985594 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.996551 0.003343 NaN 1.000000 0.986523 1 0.986902 False
15 [500:599] 5 500 599 None None analysis 0.998370 0.003343 NaN 1.000000 0.988342 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.996492 0.003343 NaN 1.000000 0.986464 1 0.986902 False
17 [700:799] 7 700 799 None None analysis 0.996512 0.003343 NaN 1.000000 0.986484 1 0.986902 False
18 [800:899] 8 800 899 None None analysis 0.992599 0.003343 NaN 1.000000 0.982571 1 0.986902 False
19 [900:999] 9 900 999 None None analysis 0.978610 0.003343 NaN 0.988638 0.968582 1 0.986902 True
In [287]:
##################################
# Visualizing CBPE performance for prior shift
##################################
chunk_cbpe_performance_analysis_visualization_p3 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p3, baseline_name="Baseline Control", scenario_name="Prior Shift")
[Figure: CBPE-estimated ROC-AUC per chunk, Prior Shift vs Baseline Control]
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 0
3 1 reference 0
4 2 analysis 0
5 2 reference 0
6 3 analysis 0
7 3 reference 0
8 4 analysis 0
9 4 reference 0
10 5 analysis 0
11 5 reference 0
12 6 analysis 0
13 6 reference 0
14 7 analysis 0
15 7 reference 0
16 8 analysis 0
17 8 reference 0
18 9 analysis 1
19 9 reference 0
In [288]:
##################################
# Calculating realized performance for prior shift
##################################
chunk_realized_performance_analysis_p3 = calculate_chunk_realized_performance(p1, p3, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.994800 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.992400 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.994400 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.990800 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.998000 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.997600 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.992800 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.000000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.998000 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.994400 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.995116 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.992203 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.989087 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.990238 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.997993 1 0.986902 False
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.997565 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.993316 1 0.986902 False
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 0.998355 1 0.986902 False
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 1.000000 1 0.986902 False
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 1.000000 1 0.986902 False
In [289]:
##################################
# Visualizing the CBPE and realized performance comparison
# for prior shift
##################################
chunk_realized_performance_analysis_visualization_p3 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p3, baseline_name="Baseline Control", scenario_name="Prior Shift")
[Figure: CBPE-estimated vs realized ROC-AUC per chunk, Prior Shift]
CBPE vs Realized ROC-AUC Deviation Summary (Prior Shift):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.995116 0.002002 False
1 1 0.993730 0.992203 0.001528 False
2 2 0.996812 0.989087 0.007724 False
3 3 0.995836 0.990238 0.005599 False
4 4 0.996884 0.997993 -0.001108 False
5 5 0.998574 0.997565 0.001009 False
6 6 0.995688 0.993316 0.002373 False
7 7 0.998026 0.998355 -0.000329 False
8 8 0.995331 1.000000 -0.004669 False
9 9 0.996120 1.000000 -0.003880 False

1.9.4 Simulated Concept Drift¶

Concept Drift occurs when the underlying relationship between input features and target labels evolves over time. Unlike covariate drift, where features change independently, concept drift implies that the model’s mapping function itself becomes outdated. Concept drift is among the most damaging forms of drift because it directly undermines predictive accuracy. Detecting it often requires monitoring model outputs or inferred performance over time. NannyML addresses this by estimating performance even when ground truth labels are unavailable. Concept drift is typically signaled by a gradual or sudden decline in performance metrics, inconsistent error patterns, or misalignment between expected and actual prediction behavior. Its impact is severe: models may lose predictive power entirely if they cannot adapt. Detection methods include window-based performance monitoring, hypothesis testing, adaptive ensembles, and statistical monitoring of residuals. Corrective actions include periodic retraining, incremental learning, and online adaptation strategies. NannyML leverages Confidence-Based Performance Estimation (CBPE) and other statistical techniques to estimate performance degradation without labels, making it possible to detect concept drift in real-time production environments.
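
Outside of label-free estimation, one classical labeled-stream approach to concept drift detection is an error-rate monitor in the spirit of the Drift Detection Method (DDM). The simplified sketch below is only an illustration under assumed warm-up and threshold settings, not a technique applied elsewhere in this notebook.

import numpy as np

def simple_error_rate_drift_scan(errors, warmup=30):
    # Scanning a 0/1 prediction-error stream and flagging the first index where the
    # running error rate exceeds its best observed level by three standard deviations
    errors = np.asarray(errors, dtype=float)
    p_min, s_min = np.inf, np.inf
    for i in range(1, len(errors) + 1):
        p = errors[:i].mean()
        s = np.sqrt(p * (1 - p) / i)
        if i < warmup:
            continue
        if p + s < p_min + s_min:
            p_min, s_min = p, s
        if p + s > p_min + 3 * s_min:
            return {"drift_index": i - 1, "error_rate": float(p)}
    return {"drift_index": None, "error_rate": float(errors.mean()) if len(errors) else float("nan")}

# Example: a stream with a 10% error rate for 500 points, then 50% after the concept changes
stream = np.array(([0] * 9 + [1]) * 50 + [0, 1] * 250)
print(simple_error_rate_drift_scan(stream))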

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.
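
A minimal sketch of how a CBPE estimator is typically configured, following the pattern in NannyML's documentation, is shown below; the synthetic prediction frames, column names, chunk size, and metric list are assumptions for illustration rather than this project's exact pipeline.

import numpy as np
import pandas as pd
import nannyml as nml

rng = np.random.default_rng(7)

def make_prediction_frame(n):
    # Synthetic binary-classification predictions mimicking a reasonably calibrated model
    y_true = rng.integers(0, 2, n)
    y_pred_proba = np.clip(y_true * 0.8 + rng.normal(0.1, 0.15, n), 0.01, 0.99)
    return pd.DataFrame({
        "diagnosis": y_true,
        "y_pred_proba": y_pred_proba,
        "y_pred": (y_pred_proba >= 0.5).astype(int),
    })

reference_df = make_prediction_frame(500)   # labeled baseline period
analysis_df = make_prediction_frame(500)    # production period (labels are ignored by CBPE)

# Configuring CBPE to estimate ROC-AUC per 50-row chunk without labels
estimator = nml.CBPE(
    y_pred_proba="y_pred_proba",
    y_pred="y_pred",
    y_true="diagnosis",
    problem_type="classification_binary",
    metrics=["roc_auc"],
    chunk_size=50,
)

# Learning the confidence-to-correctness relationship on the labeled baseline,
# then estimating performance on the unlabeled analysis chunks
estimator.fit(reference_df)
estimated = estimator.estimate(analysis_df)
print(estimated.to_df().head())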

  1. A synthetic time-ordered data stream called Concept Drift was created by progressively flipping class labels for samples above a feature-specific threshold across time-ordered chunks. It gradually increases the proportion of flipped labels based on a ramp fraction, mimicking a real-world scenario where the decision boundary between classes shifts over time.
  2. The simulated dataset was defined by the following parameters:
    • CONCEPT_DRIFT_SLICE_FEATURES = lists the 15 selected features whose upper-value regions are targeted for inducing localized concept drift through label flipping
    • CONCEPT_DRIFT_SLICE_THRESHOLD_QUANTILE = quantile threshold above which samples of each targeted feature become eligible for label flipping, fixed at 0.75
    • CONCEPT_DRIFT_FLIP_FRACTION = maximum fraction of eligible samples within each chunk that can have their labels flipped once the drift ramp is complete, fixed at 1.00
    • CONCEPT_DRIFT_RAMP = number of chunks over which the concept drift intensity increases gradually until reaching its full effect fixed at 10
  3. Using selected features evaluated against the baseline control, post-deployment anomaly detection analysis showed:
    • No distributional shifts observed in features across chunks relative to the baseline control
    • Consistent average feature values over time, indicating stable mean trends and deviations relative to the baseline control
    • Consistently lower class proportion for diagnosis=M across chunks over time relative to the baseline control
    • Zero missing rate per feature over time against the baseline control
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • No distributional shift alerts exceeding the drift threshold were observed for any chunk using the Kolmogorov–Smirnov (KS) test statistics, indicating variability comparable to the baseline control reference.
    • No performance degradation alerts were observed for any chunk using the CBPE-estimated ROC-AUC trends relative to the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • Deviation alerts exceeding the defined performance difference threshold were observed for all chunks (0 to 9) when comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
  6. While the study’s main goal was drift detection, potential remedial measures to address Concept Drift may include:
    • Deploying online or incremental learning approaches that allow the model to adapt as decision boundaries evolve.
    • Using ensemble methods where older models are gradually replaced or down-weighted as new data reflects updated patterns.
    • Implementing drift detection triggers to automate retraining or model replacement workflows when performance degradation is detected.
In [290]:
##################################
# Defining the concept drift-specific parameters
# for the post-model deployment scenario simulation
##################################
CONCEPT_DRIFT_SLICE_FEATURES = ['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean',
'compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se',
'radius_worst','perimeter_worst', 'smoothness_worst','concavity_worst','symmetry_worst']
CONCEPT_DRIFT_SLICE_THRESHOLD_QUANTILE = 0.75
CONCEPT_DRIFT_FLIP_FRACTION = 1.0
CONCEPT_DRIFT_RAMP = 10
In [291]:
##################################
# Defining a function for 
# simulating concept drift
##################################
def simulate_P4_concept_drift(df):
    # Initializing a random number generator for reproducibility
    rng = np.random.RandomState(RANDOM_STATE)
    # Creating a time-ordered synthetic stream of data chunks
    stream = make_stream_from_dataframe(df)
    # Iterating through each feature defined to induce localized concept drift
    for feat in CONCEPT_DRIFT_SLICE_FEATURES:
        # Determining a threshold (quantile-based) to define the region affected by concept drift
        thr = df[feat].quantile(CONCEPT_DRIFT_SLICE_THRESHOLD_QUANTILE)
        # Looping through each synthetic chunk (simulated monitoring time)
        for c in range(N_CHUNKS):
            # Computing progression of concept drift (0 → 1) across ramp duration
            frac = min(1.0, (c+1)/CONCEPT_DRIFT_RAMP)
            # Identifying data points within the current chunk and above the feature threshold
            mask = (stream['__chunk']==c) & (stream[feat]>=thr)
            # Extracting indices of samples eligible for label flipping
            idxs = stream[mask].index
            # Computing number of samples to flip based on drift fraction and configured flip rate
            n_flip = int(len(idxs) * CONCEPT_DRIFT_FLIP_FRACTION * frac)
            # Performing label flipping only if there are samples to modify
            if n_flip>0:
                flip = rng.choice(idxs, n_flip, replace=False)
                # Swapping labels: 'B' becomes 'M', and 'M' becomes 'B'
                stream.loc[flip, TARGET_COL] = stream.loc[flip, TARGET_COL].map({'B':'M','M':'B'})
    # Returning the modified data stream containing simulated concept drift
    return stream
In [292]:
##################################
# Simulating post-deployment data drift scenario 4 = concept drift
##################################
p4 = simulate_P4_concept_drift(breast_cancer_monitoring_baseline)
In [293]:
##################################
# Exploring the simulated concept drift
##################################
display(p4)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 M 21.71 17.25 140.90 1546.0 0.09384 0.08562 0.11680 0.084650 0.1717 ... 199.50 3143.0 0.1363 0.16280 0.28610 0.18200 0.2510 0.06494 0 0
1 B 12.25 22.44 78.18 466.5 0.08192 0.05200 0.01714 0.012610 0.1544 ... 92.74 622.9 0.1256 0.18040 0.12300 0.06335 0.3100 0.08203 0 0
2 M 10.65 25.22 68.01 347.0 0.09657 0.07234 0.02379 0.016150 0.1897 ... 77.98 455.7 0.1499 0.13980 0.11250 0.06136 0.3409 0.08147 0 0
3 M 24.25 20.20 166.20 1761.0 0.14470 0.28670 0.42680 0.201200 0.2655 ... 180.90 2073.0 0.1696 0.42440 0.58030 0.22480 0.3222 0.08009 0 0
4 B 10.90 12.96 68.69 366.8 0.07515 0.03718 0.00309 0.006588 0.1442 ... 78.07 470.0 0.1171 0.08294 0.01854 0.03953 0.2738 0.07685 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 B 14.53 19.34 94.25 659.7 0.08388 0.07800 0.08817 0.029250 0.1473 ... 108.10 830.5 0.1089 0.26490 0.37790 0.09594 0.2471 0.07463 9 9
996 B 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.094510 0.1860 ... 142.20 1493.0 0.1492 0.25360 0.37590 0.15100 0.3074 0.07863 9 9
997 B 14.19 23.81 92.87 610.7 0.09463 0.13060 0.11150 0.064620 0.2235 ... 115.00 811.3 0.1559 0.40590 0.37440 0.17720 0.4724 0.10260 9 9
998 M 15.12 16.68 98.78 716.6 0.08876 0.09588 0.07550 0.040790 0.1594 ... 117.70 989.5 0.1491 0.33310 0.33270 0.12520 0.3415 0.09740 9 9
999 B 10.60 18.95 69.28 346.4 0.09688 0.11470 0.06387 0.026420 0.1922 ... 78.28 424.8 0.1213 0.25150 0.19160 0.07926 0.2940 0.07587 9 9

1000 rows × 33 columns

In [294]:
##################################
# Visualizing baseline feature variability
# for the simulated concept drift scenario
# and baseline control
##################################
plot_feature_boxplot_comparison(p1, p4, CONCEPT_DRIFT_SLICE_FEATURES, "Concept Drift")
No description has been provided for this image
In [295]:
##################################
# Visualizing baseline feature variability
# for the simulated concept drift scenario
# and baseline control
##################################
plot_feature_mean_line(p1, p4, CONCEPT_DRIFT_SLICE_FEATURES, "Concept Drift")
No description has been provided for this image
In [296]:
##################################
# Inspecting class distribution
# for the simulated concept drift scenario
# and baseline control
##################################
for feat in CONCEPT_DRIFT_SLICE_FEATURES:
    fig, ax = plt.subplots(1, 2, figsize=(14, 3), sharey=True)
    combined_min = min(p1[feat].min(), p4[feat].min()) 
    combined_max = max(p1[feat].max(), p4[feat].max()) 
    y_margin = 0.05 * (combined_max - combined_min)
    y_min, y_max = combined_min - y_margin, combined_max + y_margin
    sns.boxplot(x="diagnosis", y=feat, data=p1, ax=ax[0], hue="diagnosis", order=['M', 'B'], palette={"M": "#1f77b4", "B": "#aec7e8"})
    ax[0].set_title(f"{feat} by Label - Baseline Control")
    ax[0].set_ylim(y_min, y_max)
    sns.boxplot(x="diagnosis", y=feat, data=p4, ax=ax[1], hue="diagnosis", order=['M', 'B'], palette={"M": "#ff7f0e", "B": "#ffbb78"})
    ax[1].set_title(f"{feat} by Label - Concept Drift")
    ax[1].set_ylim(y_min, y_max)
    plt.show()
    
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [297]:
##################################
# Visualizing baseline feature variability
# by target label
# for the simulated concept drift scenario
# and baseline control
##################################
plot_feature_target_boxplot_comparison(p1, p4, CONCEPT_DRIFT_SLICE_FEATURES, "Concept Drift")
No description has been provided for this image
In [298]:
##################################
# Inspecting class balance stability
# for the simulated concept drift scenario
# and baseline control
##################################
plot_class_proportion(p1, p4, "Concept Drift")
No description has been provided for this image
In [299]:
##################################
# Evaluating missingness spike
# of the simulated concept drift scenario
# and the baseline control
##################################
plot_missingness_spike(p1, p4, CONCEPT_DRIFT_SLICE_FEATURES, "Concept Drift")
No description has been provided for this image
In [300]:
##################################
# Detecting univariate drift for concept drift
##################################
univariate_drift_analysis_p4 = detect_univariate_drift(p1, p4, FEATURE_COLUMNS, "Concept Drift")
Univariate drift visualization generated for Concept Drift
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.039        0.101506            None  ...               None   
1              0.055        0.101506            None  ...               None   
2              0.079        0.101506            None  ...               None   
3              0.055        0.101506            None  ...               None   
4              0.070        0.101506            None  ...               None   
5              0.060        0.101506            None  ...               None   
6              0.051        0.101506            None  ...               None   
7              0.076        0.101506            None  ...               None   
8              0.081        0.101506            None  ...               None   
9              0.053        0.101506            None  ...               None   

                 texture_se                                         \
         kolmogorov_smirnov                                          
   alert              value upper_threshold lower_threshold  alert   
0  False              0.056        0.144826            None  False   
1  False              0.087        0.144826            None  False   
2  False              0.049        0.144826            None  False   
3  False              0.071        0.144826            None  False   
4  False              0.099        0.144826            None  False   
5  False              0.106        0.144826            None  False   
6  False              0.095        0.144826            None  False   
7  False              0.066        0.144826            None  False   
8  False              0.059        0.144826            None  False   
9  False              0.113        0.144826            None  False   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0              0.047        0.143381            None  False  
1              0.091        0.143381            None  False  
2              0.054        0.143381            None  False  
3              0.087        0.143381            None  False  
4              0.090        0.143381            None  False  
5              0.099        0.143381            None  False  
6              0.071        0.143381            None  False  
7              0.085        0.143381            None  False  
8              0.120        0.143381            None  False  
9              0.064        0.143381            None  False  

[10 rows x 127 columns]
In [301]:
##################################
# Visualizing univariate drift for concept drift
##################################
univariate_drift_analysis_visualization_p4 = plot_univariate_drift_summary(univariate_drift_analysis_p4, FEATURE_COLUMNS, "Concept Drift")
No description has been provided for this image
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 0
1 texture_mean 0
2 perimeter_mean 0
3 area_mean 0
4 smoothness_mean 0
5 compactness_mean 0
6 concavity_mean 0
7 concave points_mean 0
8 symmetry_mean 0
9 fractal_dimension_mean 0
10 radius_se 0
11 texture_se 0
12 perimeter_se 0
13 area_se 0
14 smoothness_se 0
15 compactness_se 0
16 concavity_se 0
17 concave points_se 0
18 symmetry_se 0
19 fractal_dimension_se 0
20 radius_worst 0
21 texture_worst 0
22 perimeter_worst 0
23 area_worst 0
24 smoothness_worst 0
25 compactness_worst 0
26 concavity_worst 0
27 concave points_worst 0
28 symmetry_worst 0
29 fractal_dimension_worst 0
In [302]:
##################################
# Estimating CBPE performance for concept drift
##################################
chunk_cbpe_performance_analysis_p4 = estimate_chunk_cbpe_performance(p1, p4, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.0 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.0 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.0 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.0 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.0 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.0 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.0 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.0 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.0 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.0 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.997118 0.003343 NaN 1.0 0.987090 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.993730 0.003343 NaN 1.0 0.983702 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.996812 0.003343 NaN 1.0 0.986784 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.995836 0.003343 NaN 1.0 0.985808 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.996884 0.003343 NaN 1.0 0.986857 1 0.986902 False
15 [500:599] 5 500 599 None None analysis 0.998574 0.003343 NaN 1.0 0.988546 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.995688 0.003343 NaN 1.0 0.985660 1 0.986902 False
17 [700:799] 7 700 799 None None analysis 0.998026 0.003343 NaN 1.0 0.987998 1 0.986902 False
18 [800:899] 8 800 899 None None analysis 0.995331 0.003343 NaN 1.0 0.985303 1 0.986902 False
19 [900:999] 9 900 999 None None analysis 0.996120 0.003343 NaN 1.0 0.986092 1 0.986902 False
In [303]:
##################################
# Visualizing CBPE performance for concept drift
##################################
chunk_cbpe_performance_analysis_visualization_p4 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p4, baseline_name="Baseline Control", scenario_name="Concept Drift")
No description has been provided for this image
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 0
3 1 reference 0
4 2 analysis 0
5 2 reference 0
6 3 analysis 0
7 3 reference 0
8 4 analysis 0
9 4 reference 0
10 5 analysis 0
11 5 reference 0
12 6 analysis 0
13 6 reference 0
14 7 analysis 0
15 7 reference 0
16 8 analysis 0
17 8 reference 0
18 9 analysis 0
19 9 reference 0
In [304]:
##################################
# Calculating realized performance for concept drift
##################################
chunk_realized_performance_analysis_p4 = calculate_chunk_realized_performance(p1, p4, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.994800 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.992400 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.994400 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.990800 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.998000 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.997600 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.992800 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.000000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.998000 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.994400 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.794437 1 0.986902 True
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.661978 1 0.986902 True
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.710069 1 0.986902 True
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.576577 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.537776 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.772630 1 0.986902 True
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.598260 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 0.619029 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 0.618958 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 0.628315 1 0.986902 True
In [305]:
##################################
# Visualizing the CBPE and realized performance comparison
# for concept drift
##################################
chunk_realized_performance_analysis_visualization_p4 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p4, baseline_name="Baseline Control", scenario_name="Concept Drift")
No description has been provided for this image
CBPE vs Realized ROC-AUC Deviation Summary (Concept Drift):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.794437 0.202681 True
1 1 0.993730 0.661978 0.331752 True
2 2 0.996812 0.710069 0.286742 True
3 3 0.995836 0.576577 0.419260 True
4 4 0.996884 0.537776 0.459109 True
5 5 0.998574 0.772630 0.225945 True
6 6 0.995688 0.598260 0.397429 True
7 7 0.998026 0.619029 0.378997 True
8 8 0.995331 0.618958 0.376372 True
9 9 0.996120 0.628315 0.367805 True

1.9.5 Simulated Missingness Spike¶

Missingness Spike refers to sudden increases in missing values within production data. Missing features can destabilize preprocessing pipelines, distort predictions, and signal upstream data collection failures. Monitoring missingness is critical for ensuring both model reliability and data pipeline health. NannyML provides built-in mechanisms to track and visualize changes in missing data patterns, alerting stakeholders before downstream impacts occur. Key indicators of missingness spikes include abrupt rises in null counts, missing categorical levels, or structural breaks in feature completeness. The consequences range from biased predictions to outright system failures if preprocessing pipelines cannot handle unexpected missingness. Detection methods include statistical monitoring of missing value proportions, anomaly detection on completeness metrics, and threshold-based alerts. Solutions typically involve robust imputation, pipeline hardening, and upstream data validation. NannyML offers automated missingness detection, completeness trend visualization, and configurable thresholds, ensuring that missingness issues are surfaced early.
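
A simple form of the threshold-based completeness monitoring described above can be sketched in pandas: the per-chunk missing-value rate of a feature is compared against the baseline rate plus a fixed margin. The synthetic data, the area_mean column name, and the 0.05 alert margin are assumptions for this example.

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)

# Illustrative baseline (fully observed) and a production stream with a missingness spike in one chunk
baseline = pd.DataFrame({"area_mean": rng.normal(650, 300, 500)})
production = pd.DataFrame({
    "area_mean": rng.normal(650, 300, 500),
    "chunk": np.repeat(np.arange(5), 100),
})
production.loc[(production["chunk"] == 3) & (rng.random(500) < 0.4), "area_mean"] = np.nan

baseline_missing_rate = baseline["area_mean"].isna().mean()
ALERT_MARGIN = 0.05  # assumed tolerance above the baseline missing rate

# Tracking the per-chunk missing rate and raising a threshold-based alert
for chunk_id, chunk in production.groupby("chunk"):
    rate = chunk["area_mean"].isna().mean()
    print(f"chunk {chunk_id}: missing rate={rate:.2f}, alert={rate > baseline_missing_rate + ALERT_MARGIN}")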

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.

  1. A synthetic time-ordered data stream called Missingness Spike was created by simulating different types of missing data patterns over time: a temporary spike of MCAR (Missing Completely At Random) values in specific features, MAR (Missing At Random) missingness conditioned on high values of a reference feature, and a prolonged phase that sustains increased data loss after the spike. It mimics realistic missingness behaviors for testing model robustness under data quality degradation.
  2. The simulated dataset was defined by the following parameters:
    • MCAR_FEATURES = lists the 20 selected features where MCAR values will be injected to simulate sudden, uncorrelated data loss across chunks
    • MAR_FEATURES = lists the 20 selected features where MAR values will be induced based on a relationship with a reference feature, mimicking structured missingness patterns
    • MAR_REFERENCE_FEATURE = feature used to determine which samples are more likely to experience MAR missingness identified as area_mean
    • MAR_REFERENCE_FEATURE_QUANTILE = quantile threshold above which samples of the reference feature are considered for MAR-based missingness fixed at 0.80
    • MISSINGNESS_SPIKE_FEATURES = combined set of all features (MCAR and MAR) subject to missingness events during the simulated spike
    • MISSINGNESS_SPIKE_INTENSITY = fraction of data made missing during the spike phase, controlling the severity of the sudden missingness burst fixed at 0.80
    • MISSINGNESS_SPIKE_LENGTH = number of consecutive chunks over which the MCAR missingness spike persists fixed at 6
    • MISSINGNESS_PROLONGED_INCREASE = additional fraction of missing values introduced during the post-spike phase to simulate lasting data degradation fixed at 0.50
    • MISSINGNESS_PROLONGED_LENGTH = number of chunks after the spike period during which elevated missingness levels continue fixed at 5
  3. Using selected features evaluated against the baseline control, post-deployment anomaly detection analysis showed:
    • Varied forms of distributional shift observed in features across chunks relative to the baseline control
    • Inconsistent average feature values over time, indicating unstable mean trends and deviations relative to the baseline control
    • Class proportions for diagnosis=M and diagnosis=B consistent with the baseline control across chunks
    • High missing rates on features over time, driven by the injected MCAR and MAR conditions, relative to the baseline control
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • Distributional shift alerts exceeding the drift threshold were observed for certain chunks affected by the MCAR and MAR conditions (3 to 8) using the Kolmogorov–Smirnov (KS) test statistics, indicating increased variability relative to the baseline control reference.
    • Performance degradation alerts were observed for certain chunks affected by the MCAR and MAR conditions (3 to 8) using the CBPE-estimated ROC-AUC trends relative to the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • A deviation alert exceeding the defined performance difference threshold was observed for a single chunk (6) when comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
  6. Although the analysis concentrated on identifying drift behavior, real-world responses to a Missingness Spike could involve:
    • Strengthening data validation pipelines to detect and flag sudden surges in missing values at ingestion time.
    • Using robust imputation or data augmentation techniques tailored to the type and cause of missingness including MCAR, MAR or MNAR.
    • Performing root cause analysis to trace upstream system, sensor, or integration issues contributing to the anomaly.
In [306]:
##################################
# Defining the missingness spike-specific parameters
# for the post-model deployment scenario simulation
##################################
MCAR_FEATURES = ['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean',
                 'compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean',
                 'radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst',
                 'compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']
MAR_FEATURES = ['radius_se','texture_se','perimeter_se','area_se','smoothness_se',
                'compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se',
                'radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst',
                'compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']
MAR_REFERENCE_FEATURE = 'area_mean'
MAR_REFERENCE_FEATURE_QUANTILE = 0.80
MISSINGNESS_SPIKE_FEATURES = list(dict.fromkeys(MCAR_FEATURES + MAR_FEATURES))
MISSINGNESS_SPIKE_INTENSITY = 0.8
MISSINGNESS_SPIKE_LENGTH = 6
MISSINGNESS_PROLONGED_INCREASE = 0.50
MISSINGNESS_PROLONGED_LENGTH = 5
In [307]:
##################################
# Defining a function for 
# simulating missingness spike
##################################
def simulate_P5_missingness_spike(df, mar_reference_feature=MAR_REFERENCE_FEATURE, mar_reference_feature_quantile=MAR_REFERENCE_FEATURE_QUANTILE):
    # Initializing a random number generator for reproducibility
    rng = np.random.RandomState(RANDOM_STATE)

    # Creating time-ordered synthetic stream of data chunks
    stream = make_stream_from_dataframe(df)

    # Defining MCAR spike window
    spike_start, spike_end = N_CHUNKS // 3, N_CHUNKS // 3 + MISSINGNESS_SPIKE_LENGTH

    # Simulating MCAR (Missing Completely At Random)
    for c in range(spike_start, spike_end):
        # Identifying rows belonging to the current chunk
        mask = stream['__chunk'] == c

        for f in MCAR_FEATURES:
            # Skipping if feature not present in data
            if f not in stream.columns:
                continue

            # Indices of rows in this chunk
            idx = stream[mask].index

            # Randomly selecting a fraction of rows to make missing
            n_missing = int(len(idx) * MISSINGNESS_SPIKE_INTENSITY)
            if n_missing == 0:
                continue

            miss = rng.choice(idx, n_missing, replace=False)

            # Applying missingness to the selected rows
            stream.loc[miss, f] = np.nan

    # Simulating MAR (Missing At Random) based on a reference feature
    for c in range(N_CHUNKS):
        mask = stream['__chunk'] == c

        # Proceeding only if the MAR reference feature exists in the data
        if mar_reference_feature not in stream.columns:
            continue

        # Identifying samples with high values of the MAR reference feature (above the configured quantile)
        high_area = stream.loc[mask & (stream[mar_reference_feature] > stream[mar_reference_feature].quantile(mar_reference_feature_quantile))].index
        if len(high_area) == 0:
            continue

        # Applying MAR missingness to multiple MAR features
        for f in MAR_FEATURES:
            if f not in stream.columns:
                continue

            n_mar = int(len(high_area) * 0.2)
            if n_mar == 0:
                continue

            miss = rng.choice(high_area, n_mar, replace=False)
            stream.loc[miss, f] = np.nan

    # Simulating Prolonged missingness pattern after spikes 
    for c in range(spike_end, spike_end + MISSINGNESS_PROLONGED_LENGTH):
        mask = stream['__chunk'] == c
        for f in MCAR_FEATURES:
            if f not in stream.columns:
                continue
            idx = stream[mask].index
            n_missing = int(len(idx) * MISSINGNESS_PROLONGED_INCREASE)
            if n_missing == 0:
                continue
            miss = rng.choice(idx, n_missing, replace=False)
            stream.loc[miss, f] = np.nan

    # Returning the modified stream with simulated missingness
    return stream
In [308]:
##################################
# Simulating post-deployment data drift scenario 5 = missingness spike
##################################
p5 = simulate_P5_missingness_spike(breast_cancer_monitoring_baseline)
In [309]:
##################################
# Exploring the simulated missingness spike
##################################
display(p5)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 M 21.71 17.25 140.90 1546.0 0.09384 0.08562 0.11680 0.084650 0.1717 ... NaN 3143.0 0.1363 0.16280 0.28610 0.18200 0.2510 0.06494 0 0
1 B 12.25 22.44 78.18 466.5 0.08192 0.05200 0.01714 0.012610 0.1544 ... 92.74 622.9 0.1256 0.18040 0.12300 0.06335 0.3100 0.08203 0 0
2 B 10.65 25.22 68.01 347.0 0.09657 0.07234 0.02379 0.016150 0.1897 ... 77.98 455.7 0.1499 0.13980 0.11250 0.06136 0.3409 0.08147 0 0
3 M 24.25 20.20 166.20 1761.0 0.14470 0.28670 0.42680 0.201200 0.2655 ... 180.90 2073.0 0.1696 0.42440 0.58030 0.22480 NaN NaN 0 0
4 B 10.90 12.96 68.69 366.8 0.07515 0.03718 0.00309 0.006588 0.1442 ... 78.07 470.0 0.1171 0.08294 0.01854 0.03953 0.2738 0.07685 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 B 14.53 19.34 NaN 659.7 NaN 0.07800 NaN 0.029250 0.1473 ... NaN NaN NaN NaN NaN NaN 0.2471 0.07463 9 9
996 M NaN NaN NaN 1052.0 0.10680 0.12480 0.15690 NaN 0.1860 ... 142.20 1493.0 0.1492 0.25360 NaN NaN 0.3074 NaN 9 9
997 M 14.19 NaN 92.87 610.7 NaN 0.13060 NaN NaN 0.2235 ... 115.00 811.3 0.1559 0.40590 0.37440 NaN 0.4724 0.10260 9 9
998 M NaN 16.68 NaN NaN NaN NaN 0.07550 0.040790 NaN ... NaN 989.5 NaN NaN 0.33270 0.12520 NaN NaN 9 9
999 B 10.60 NaN NaN NaN 0.09688 0.11470 NaN 0.026420 NaN ... NaN 424.8 NaN NaN NaN 0.07926 NaN 0.07587 9 9

1000 rows × 33 columns

In [310]:
##################################
# Visualizing baseline feature variability
# for the simulated missingness spike scenario
# and baseline control
##################################
plot_feature_boxplot_comparison(p1, p5, MISSINGNESS_SPIKE_FEATURES, "Missingness Spike") 
No description has been provided for this image
In [311]:
##################################
# Visualizing baseline feature variability
# for the simulated missingness spike scenario
# and baseline control
##################################
plot_feature_mean_line(p1, p5, MISSINGNESS_SPIKE_FEATURES, "Missingness Spike")
No description has been provided for this image
In [312]:
##################################
# Inspecting class distribution
# for the simulated missingness spike scenario
# and baseline control
##################################
for feat in MISSINGNESS_SPIKE_FEATURES:
    fig, ax = plt.subplots(1, 2, figsize=(14, 3), sharey=True)
    combined_min = min(p1[feat].min(), p5[feat].min()) 
    combined_max = max(p1[feat].max(), p5[feat].max()) 
    y_margin = 0.05 * (combined_max - combined_min)
    y_min, y_max = combined_min - y_margin, combined_max + y_margin
    sns.boxplot(x="diagnosis", y=feat, data=p1, ax=ax[0], hue="diagnosis", order=['M', 'B'], palette={"M": "#1f77b4", "B": "#aec7e8"})
    ax[0].set_title(f"{feat} by Label - Baseline Control")
    ax[0].set_ylim(y_min, y_max)
    sns.boxplot(x="diagnosis", y=feat, data=p5, ax=ax[1], hue="diagnosis", order=['M', 'B'], palette={"M": "#ff7f0e", "B": "#ffbb78"})
    ax[1].set_title(f"{feat} by Label - Missingness Spike")
    ax[1].set_ylim(y_min, y_max)
    plt.show() 
    
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [313]:
##################################
# Visualizing baseline feature variability
# by target label
# for the simulated missingness spike scenario
# and baseline control
##################################
plot_feature_target_boxplot_comparison(p1, p5, MISSINGNESS_SPIKE_FEATURES, "Missingness Spike")
No description has been provided for this image
In [314]:
##################################
# Inspecting class balance stability
# for the simulated missingness spike scenario
# and baseline control
##################################
plot_class_proportion(p1, p5, "Missingness Spike")
No description has been provided for this image
In [315]:
##################################
# Evaluating missingness spike
# of the simulated missingness spike scenario
# and the baseline control
##################################
plot_missingness_spike(p1, p5, MISSINGNESS_SPIKE_FEATURES, "Missingness Spike")
No description has been provided for this image
In [316]:
##################################
# Detecting univariate drift for missingness spike
##################################
univariate_drift_analysis_p5 = detect_univariate_drift(p1, p5, FEATURE_COLUMNS, "Missingness Spike")
Univariate drift visualization generated for Missingness Spike
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.039        0.101506            None  ...               None   
1              0.055        0.101506            None  ...               None   
2              0.079        0.101506            None  ...               None   
3              0.181        0.101506            None  ...               None   
4              0.230        0.101506            None  ...               None   
5              0.265        0.101506            None  ...               None   
6              0.178        0.101506            None  ...               None   
7              0.199        0.101506            None  ...               None   
8              0.130        0.101506            None  ...               None   
9              0.102        0.101506            None  ...               None   

                 texture_se                                         \
         kolmogorov_smirnov                                          
   alert              value upper_threshold lower_threshold  alert   
0  False           0.050417        0.144826            None  False   
1  False           0.109268        0.144826            None  False   
2  False           0.048742        0.144826            None  False   
3  False           0.071000        0.144826            None  False   
4   True           0.097485        0.144826            None  False   
5   True           0.106000        0.144826            None  False   
6   True           0.098131        0.144826            None  False   
7   True           0.068444        0.144826            None  False   
8   True           0.059000        0.144826            None  False   
9   True           0.106333        0.144826            None  False   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0           0.043250        0.143381            None  False  
1           0.087598        0.143381            None  False  
2           0.059979        0.143381            None  False  
3           0.143000        0.143381            None  False  
4           0.164000        0.143381            None   True  
5           0.232000        0.143381            None   True  
6           0.131000        0.143381            None  False  
7           0.159000        0.143381            None   True  
8           0.157000        0.143381            None   True  
9           0.110348        0.143381            None  False  

[10 rows x 127 columns]
In [317]:
##################################
# Visualizing univariate drift for missingness spike
##################################
univariate_drift_analysis_visualization_p5 = plot_univariate_drift_summary(univariate_drift_analysis_p5, FEATURE_COLUMNS, "Missingness Spike")
No description has been provided for this image
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 7
1 texture_mean 6
2 perimeter_mean 6
3 area_mean 7
4 smoothness_mean 4
5 compactness_mean 6
6 concavity_mean 6
7 concave points_mean 5
8 symmetry_mean 2
9 fractal_dimension_mean 3
10 radius_se 0
11 texture_se 0
12 perimeter_se 0
13 area_se 0
14 smoothness_se 0
15 compactness_se 0
16 concavity_se 0
17 concave points_se 0
18 symmetry_se 0
19 fractal_dimension_se 0
20 radius_worst 6
21 texture_worst 4
22 perimeter_worst 5
23 area_worst 6
24 smoothness_worst 3
25 compactness_worst 6
26 concavity_worst 7
27 concave points_worst 7
28 symmetry_worst 2
29 fractal_dimension_worst 6
In [318]:
##################################
# Estimating CBPE performance for missingness spike
##################################
chunk_cbpe_performance_analysis_p5 = estimate_chunk_cbpe_performance(p1, p5, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.000000 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.000000 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.000000 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.000000 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.000000 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.000000 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.000000 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.000000 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.000000 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.000000 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.997118 0.003343 NaN 1.000000 0.987090 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.993730 0.003343 NaN 1.000000 0.983702 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.996812 0.003343 NaN 1.000000 0.986784 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.981107 0.003343 NaN 0.991135 0.971079 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.970700 0.003343 NaN 0.980728 0.960672 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.989608 0.003343 NaN 0.999636 0.979580 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.970658 0.003343 NaN 0.980685 0.960630 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.977567 0.003343 NaN 0.987595 0.967539 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.965575 0.003343 NaN 0.975603 0.955547 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.990589 0.003343 NaN 1.000000 0.980561 1 0.986902 False
In [319]:
##################################
# Visualizing CBPE performance for missingness spike
##################################
chunk_cbpe_performance_analysis_visualization_p5 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p5, baseline_name="Baseline Control", scenario_name="Missingness Spike")
No description has been provided for this image
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 0
3 1 reference 0
4 2 analysis 0
5 2 reference 0
6 3 analysis 1
7 3 reference 0
8 4 analysis 1
9 4 reference 0
10 5 analysis 0
11 5 reference 0
12 6 analysis 1
13 6 reference 0
14 7 analysis 1
15 7 reference 0
16 8 analysis 1
17 8 reference 0
18 9 analysis 0
19 9 reference 0
In [320]:
##################################
# Calculating realized performance for missingness spike
##################################
chunk_realized_performance_analysis_p5 = calculate_chunk_realized_performance(p1, p5, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.9948 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.9924 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.9944 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.9908 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.9980 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.9976 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.9928 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.0000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.9980 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.9944 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.9948 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.9924 1 0.986902 False
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.9944 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.9708 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.9860 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.9708 1 0.986902 True
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.9424 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 0.9788 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 0.9784 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 0.9924 1 0.986902 False
In [321]:
##################################
# Visualizing the CBPE and realized performance comparison
# for missingness spike
##################################
chunk_realized_performance_analysis_visualization_p5 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p5, baseline_name="Baseline Control", scenario_name="Missingness Spike")
No description has been provided for this image
CBPE vs Realized ROC-AUC Deviation Summary (Missingness Spike):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.9948 0.002318 False
1 1 0.993730 0.9924 0.001330 False
2 2 0.996812 0.9944 0.002412 False
3 3 0.995836 0.9708 0.025036 False
4 4 0.996884 0.9860 0.010884 False
5 5 0.998574 0.9708 0.027774 False
6 6 0.995688 0.9424 0.053288 True
7 7 0.998026 0.9788 0.019226 False
8 8 0.995331 0.9784 0.016931 False
9 9 0.996120 0.9924 0.003720 False

1.9.6 Simulated Seasonal Pattern¶

Seasonal Pattern Shift represents periodic fluctuations in data distributions or outcomes that follow predictable cycles. If models are not trained with sufficient historical data to capture these patterns, their predictions may systematically underperform during certain periods. NannyML’s monitoring can reveal recurring deviations, helping teams distinguish between natural seasonality and genuine drift that requires retraining. Seasonality is often characterized by cyclic patterns in data features, prediction distributions, or performance metrics. Its impact includes systematic biases, recurring error peaks, and difficulty distinguishing drift from natural variability. Detection techniques include autocorrelation analysis, Fourier decomposition, and seasonal-trend decomposition. Mitigation strategies involve training with longer historical datasets, adding time-related features, or developing seasonally adaptive models. NannyML highlights recurring deviations in drift metrics, making it easier for practitioners to separate cyclical behavior from true degradation, ensuring that alerts are contextually relevant.
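
As a hedged illustration of the seasonal-trend decomposition technique mentioned above, the sketch below applies statsmodels' seasonal_decompose to a synthetic per-chunk drift metric containing a repeating cycle; the cycle length of 12 chunks and the series itself are assumptions, not outputs of this project's monitoring run.

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

rng = np.random.default_rng(3)

# Synthetic per-chunk drift metric with a repeating 12-chunk cycle plus noise
n_chunks = 48
cycle = 0.05 * np.sin(2 * np.pi * np.arange(n_chunks) / 12)
drift_metric = pd.Series(0.08 + cycle + rng.normal(0, 0.01, n_chunks))

# Separating trend, seasonal, and residual components (additive model, period = 12 chunks)
decomposition = seasonal_decompose(drift_metric, model="additive", period=12)

# A seasonal component that is large relative to the residual suggests cyclical behavior
# rather than a one-off drift that would warrant retraining
seasonal_strength = decomposition.seasonal.std() / (decomposition.resid.dropna().std() + 1e-9)
print(f"Seasonal-to-residual std ratio: {seasonal_strength:.2f}")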

Performance Estimation Without Labels refers to scenarios in real-world deployments where the ground truth often arrives with delays or may never be available. This makes direct performance tracking difficult. NannyML addresses this challenge by providing algorithms to estimate model performance without labels using confidence distributions, statistical inference, and robust estimation techniques. This capability allows practitioners to maintain visibility into model health continuously, even in label-scarce settings, bridging a critical gap in MLOps monitoring practices. Algorithms in this domain include Confidence-Based Performance Estimation (CBPE), which infers performance by comparing predicted probability distributions against expected confidence intervals, and Direct Loss Estimation, which approximates error rates based on calibration. Statistical inference techniques allow practitioners to construct confidence bounds around estimated metrics, while robust estimation mitigates the risk of spurious signals caused by small sample sizes or noisy predictions. NannyML provides implementations of CBPE and DLE, supporting metrics such as precision, recall, F1-score, and AUROC, all estimated without labels. This makes it possible to detect when a model is underperforming even before labels are collected, reducing blind spots in production monitoring.

Kolmogorov–Smirnov (KS) Statistic is a non-parametric measure used to detect univariate data drift by comparing the empirical distribution of a single feature in a new (analysis) dataset to that of a reference (baseline) dataset. It quantifies the maximum difference between the cumulative distribution functions (CDFs) of the two samples, effectively measuring how much the shape or position of the feature’s distribution has changed over time. In the context of drift detection across chunks of data (simulated or time-ordered batches), the KS statistic is computed for each feature per chunk relative to the baseline, producing a sequence of drift values that reflect evolving feature behavior. A threshold, often derived from statistical significance, defines when the observed difference is unlikely to occur by chance, indicating a potential distributional shift. When the KS value exceeds this threshold for a feature in a given chunk, it triggers a drift alert, signaling that the feature’s data-generating process has changed meaningfully from the baseline. Because the KS test is distribution-agnostic and sensitive to both location (mean) and shape changes, it serves as a robust and interpretable tool for monitoring univariate feature stability in deployed ML systems. Over multiple chunks, visualizing KS values against thresholds enables practitioners to distinguish random fluctuations from systematic drifts, forming the foundation of univariate drift monitoring in model observability pipelines.
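
A minimal per-chunk KS computation can be sketched directly with scipy for intuition; NannyML computes and thresholds these statistics internally, so the helper below is illustrative only. The p1/p6 frames, the __chunk column, and the feature name follow this project's conventions but are assumptions in this sketch.

##################################
# Illustrative sketch: per-chunk Kolmogorov–Smirnov drift scoring with scipy
##################################
import pandas as pd
from scipy.stats import ks_2samp

def ks_per_chunk(reference_df, analysis_df, feature, chunk_col="__chunk"):
    rows = []
    for chunk_id, chunk in analysis_df.groupby(chunk_col):
        # Comparing the chunk's empirical CDF against the full reference CDF
        statistic, p_value = ks_2samp(reference_df[feature], chunk[feature])
        rows.append({"chunk": chunk_id, "ks_statistic": statistic, "p_value": p_value})
    return pd.DataFrame(rows)

# Example usage against the baseline control and seasonal pattern streams:
# display(ks_per_chunk(p1, p6, "radius_mean"))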

Performance Estimation With Labels refers to the direct evaluation of model predictions against actual ground truth outcomes once labels are available. Unlike label-free methods, this approach allows for precise calculation of traditional performance metrics such as accuracy, precision, recall, F1-score, AUROC, and calibration error. Monitoring with labels provides the most reliable indication of model performance, enabling fine-grained diagnosis of errors and biases. The advantage of having labels is the ability to attribute errors to specific subgroups, detect fairness violations, and conduct targeted retraining. Challenges include label delay, annotation quality, and ensuring that labels accurately reflect the operational environment. Common approaches include sliding window evaluation, where performance is tracked over recent data batches, and benchmark comparison, where production metrics are compared to baseline test set results. NannyML incorporates labeled performance tracking alongside its label-free estimators, allowing users to validate estimates once ground truth becomes available. This dual capability ensures consistency, improves confidence in label-free methods, and provides a comprehensive framework for performance monitoring in both short-term and long-term horizons.
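
Once labels arrive, realized per-chunk evaluation reduces to a grouped computation of standard metrics. The sketch below shows the idea with scikit-learn; the model handle, feature list, column names, and the choice of M (malignant) as the positive class mirror this project's setup, but the helper itself is for exposition only.

##################################
# Illustrative sketch: realized per-chunk ROC-AUC with scikit-learn
##################################
import pandas as pd
from sklearn.metrics import roc_auc_score

def realized_roc_auc_per_chunk(model, df, feature_columns,
                               target_col="diagnosis", chunk_col="__chunk"):
    rows = []
    for chunk_id, chunk in df.groupby(chunk_col):
        # Scoring positive-class probabilities and comparing against ground truth
        proba = model.predict_proba(chunk[feature_columns])[:, 1]
        y_true = (chunk[target_col] == "M").astype(int)
        rows.append({"chunk": chunk_id, "realized_roc_auc": roc_auc_score(y_true, proba)})
    return pd.DataFrame(rows)

# Example usage:
# display(realized_roc_auc_per_chunk(boosted_cb_optimal, p6, FEATURE_COLUMNS))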

Confidence-Based Performance Estimation (CBPE) is a label-free performance monitoring method that estimates model quality metrics such as ROC-AUC, precision, or F1-score when ground truth labels are delayed or unavailable. Instead of relying on actual outcomes, CBPE infers performance from the model’s predicted probability distributions by leveraging the relationship between confidence and correctness established during a baseline control period (where labels were available). During this baseline phase, the model’s calibration of how well predicted probabilities align with observed outcomes is quantified and statistically modeled. When monitoring in production, CBPE applies this learned relationship to the new, unlabeled predictions, estimating expected performance metrics along with confidence intervals that reflect statistical uncertainty. These intervals enable practitioners to detect significant deviations in estimated performance, even in the absence of real labels, by comparing current confidence distributions against the baseline reference. CBPE thus provides a continuous proxy for true model performance, helping teams identify degradation or drift before ground truth data becomes available. This approach bridges the label gap in real-world MLOps, offering a principled and statistically grounded means to maintain performance visibility and early warning capability in live deployments.
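
A minimal, direct use of the nannyml library's CBPE estimator is sketched below for orientation. The column names are assumptions based on this project's setup, and the parameter names reflect recent NannyML releases, so minor adjustments may be needed depending on the installed version.

##################################
# Illustrative sketch: fitting CBPE on a labeled reference period and
# estimating ROC-AUC on unlabeled analysis data
##################################
import nannyml as nml

estimator = nml.CBPE(
    y_pred_proba="y_pred_proba",          # predicted positive-class probability column
    y_pred="y_pred",                      # hard prediction column
    y_true="diagnosis",                   # ground-truth column (needed only on reference data)
    problem_type="classification_binary",
    metrics=["roc_auc"],
    chunk_size=100,
)
estimator.fit(reference_data)                   # labeled baseline period
estimated = estimator.estimate(analysis_data)   # unlabeled production data
print(estimated.to_df().head())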

  1. A synthetic time-ordered data stream called Seasonal Pattern was created by introducing sinusoidal variations to selected features of the validation and training data to mimic periodic seasonal effects. The amplitude of each feature’s oscillation was scaled by its standard deviation and a defined multiplier, creating realistic cyclical fluctuations across chunks.
  2. The simulated dataset was defined by the following parameters:
    • SEASONAL_PATTERN_FEATURES = lists the 10 selected features that will be modulated with a sinusoidal seasonal pattern to simulate cyclical variations over time
    • SEASONAL_AMPLITUDE_SIGMAS = scaling factor that determines the amplitude of the seasonal fluctuation by multiplying each feature’s standard deviation, fixed at 2.5
    • SEASONAL_PERIOD = number of chunks that complete one full sinusoidal cycle, controlling the frequency of the simulated seasonal pattern, fixed at 10
  3. Using selected features evaluated against the baseline control, post-deployment anomaly detection analysis showed:
    • Increasing and decreasing distributional variability on each feature across chunks against the baseline control
    • Increasing and decreasing average feature values over time indicating unstable mean trends and deviations against the baseline control
    • Balanced class proportions between diagnosis=M and diagnosis=B across chunks against the baseline control
    • Zero missing rate per feature over time against the baseline control
  4. Applying Performance Estimation Without Labels from NannyML showed:
    • Distributional shift alerts observed exceeding the drift threshold for most chunks (1 to 4, 6 to 9) using the Kolmogorov–Smirnov (KS) test statistics, indicating high variability as compared to the baseline control reference.
    • Performance degradation alerts observed for several chunks (1, 4, 6 to 8) using the CBPE-estimated ROC-AUC trends relative to the confidence interval estimates.
  5. Applying Performance Estimation With Labels from NannyML showed:
    • Deviation alerts observed exceeding the defined performance difference threshold for most chunks (2 to 4, 7 to 9) by comparing the CBPE-estimated versus realized (true) ROC-AUC values per chunk.
  6. While this study emphasized detection rather than correction, potential mitigation strategies for a Seasonal Pattern drift scenario include:
    • Incorporating seasonality-aware features or time-based encoding (e.g., cyclic encoding) in the model to capture recurring effects.
    • Using time-series decomposition or differencing techniques to remove predictable seasonal components before modeling.
    • Periodically recalibrating or retraining the model at consistent seasonal intervals to maintain predictive accuracy over time.
In [322]:
##################################
# Defining the seasonal pattern-specific parameters
# for the post-model deployment scenario simulation
##################################
SEASONAL_PATTERN_FEATURES = ['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean',
'compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean']
SEASONAL_AMPLITUDE_SIGMAS = 2.5
SEASONAL_PERIOD = 10
In [323]:
##################################
# Defining a function for 
# simulating seasonal pattern
##################################
def simulate_P6_seasonal_pattern(df):
    # Creating a time-ordered synthetic stream of data chunks
    stream = make_stream_from_dataframe(df)
    # Computing standard deviations of seasonal features (used to scale amplitude)
    stds = df[SEASONAL_PATTERN_FEATURES].std()
    # Looping through each chunk (simulated time window)
    for c in range(N_CHUNKS):
        # Identifying the subset of rows belonging to the current chunk
        mask = stream['__chunk']==c
        # Applying sinusoidal seasonal pattern to each selected feature
        for f in SEASONAL_PATTERN_FEATURES:
            # Defining the amplitude of the seasonal signal (A = SEASONAL_AMPLITUDE_SIGMAS × feature std)
            amp = SEASONAL_AMPLITUDE_SIGMAS * stds[f]
            # Applying sinusoidal variation based on the chunk index (acting as a proxy for time)
            stream.loc[mask, f] += amp * np.sin(2 * np.pi * c / SEASONAL_PERIOD)
    # Returning the modified data stream with simulated seasonality
    return stream
In [324]:
##################################
# Simulating post-deployment data drift scenario 6 = seasonal pattern
##################################
p6 = simulate_P6_seasonal_pattern(breast_cancer_monitoring_baseline)
In [325]:
##################################
# Exploring the simulated seasonal pattern
##################################
display(p6)
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst __chunk __timestamp
0 M 21.710000 17.250000 140.900000 1546.000000 0.093840 0.085620 0.11680 0.084650 0.171700 ... 199.50 3143.0 0.1363 0.16280 0.28610 0.18200 0.2510 0.06494 0 0
1 B 12.250000 22.440000 78.180000 466.500000 0.081920 0.052000 0.01714 0.012610 0.154400 ... 92.74 622.9 0.1256 0.18040 0.12300 0.06335 0.3100 0.08203 0 0
2 B 10.650000 25.220000 68.010000 347.000000 0.096570 0.072340 0.02379 0.016150 0.189700 ... 77.98 455.7 0.1499 0.13980 0.11250 0.06136 0.3409 0.08147 0 0
3 M 24.250000 20.200000 166.200000 1761.000000 0.144700 0.286700 0.42680 0.201200 0.265500 ... 180.90 2073.0 0.1696 0.42440 0.58030 0.22480 0.3222 0.08009 0 0
4 B 10.900000 12.960000 68.690000 366.800000 0.075150 0.037180 0.00309 0.006588 0.144200 ... 78.07 470.0 0.1171 0.08294 0.01854 0.03953 0.2738 0.07685 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 B 9.386477 13.443854 58.870834 149.764439 0.062021 -0.004532 -0.03135 -0.028334 0.104073 ... 108.10 830.5 0.1089 0.26490 0.37790 0.09594 0.2471 0.07463 9 9
996 M 13.166477 14.683854 85.420834 542.064439 0.084941 0.042268 0.03738 0.036926 0.142773 ... 142.20 1493.0 0.1492 0.25360 0.37590 0.15100 0.3074 0.07863 9 9
997 M 9.046477 17.913854 57.490834 100.764439 0.072771 0.048068 -0.00802 0.007036 0.180273 ... 115.00 811.3 0.1559 0.40590 0.37440 0.17720 0.4724 0.10260 9 9
998 M 9.976477 10.783854 63.400834 206.664439 0.066901 0.013348 -0.04402 -0.016794 0.116173 ... 117.70 989.5 0.1491 0.33310 0.33270 0.12520 0.3415 0.09740 9 9
999 B 5.456477 13.053854 33.900834 -163.535561 0.075021 0.032168 -0.05565 -0.031164 0.148973 ... 78.28 424.8 0.1213 0.25150 0.19160 0.07926 0.2940 0.07587 9 9

1000 rows × 33 columns

In [326]:
##################################
# Visualizing baseline feature variability
# for the simulated seasonal pattern scenario
# and baseline control
##################################
plot_feature_boxplot_comparison(p1, p6, SEASONAL_PATTERN_FEATURES, "Seasonal Pattern")
[Figure: feature variability boxplots, Baseline Control vs Seasonal Pattern]
In [327]:
##################################
# Visualizing feature mean trends over chunks
# of the simulated seasonal pattern scenario
# and the baseline control
##################################
plot_feature_mean_line(p1, p6, SEASONAL_PATTERN_FEATURES, "Seasonal Pattern")
[Figure: per-chunk feature mean trends, Baseline Control vs Seasonal Pattern]
In [328]:
##################################
# Inspecting feature distributions by target label
# for the simulated seasonal pattern scenario
# and baseline control
##################################
for feat in SEASONAL_PATTERN_FEATURES:
    fig, ax = plt.subplots(1, 2, figsize=(14, 3), sharey=True)
    combined_min = min(p1[feat].min(), p6[feat].min()) 
    combined_max = max(p1[feat].max(), p6[feat].max()) 
    y_margin = 0.05 * (combined_max - combined_min)
    y_min, y_max = combined_min - y_margin, combined_max + y_margin
    sns.boxplot(x="diagnosis", y=feat, data=p1, ax=ax[0], order=['M', 'B'])
    ax[0].set_title(f"{feat} by Label - Baseline Control")
    ax[0].set_ylim(y_min, y_max)
    sns.boxplot(x="diagnosis", y=feat, data=p6, ax=ax[1], order=['M', 'B'])
    ax[1].set_title(f"{feat} by Label - Seasonal Pattern")
    ax[1].set_ylim(y_min, y_max)
    plt.show()
    
[Figures: per-feature boxplots by diagnosis label, Baseline Control vs Seasonal Pattern, one panel pair per selected feature]
In [329]:
##################################
# Visualizing baseline feature variability
# by target label
# for the simulated seasonal pattern scenario
# and baseline control
##################################
plot_feature_target_boxplot_comparison(p1, p6, SEASONAL_PATTERN_FEATURES, "Seasonal Pattern")
[Figure: feature boxplots by target label, Baseline Control vs Seasonal Pattern]
In [330]:
##################################
# Inspecting class balance stability
# for the simulated seasonal pattern scenario
# and baseline control
##################################
plot_class_proportion(p1, p6, "Seasonal Pattern")
[Figure: class proportion per chunk, Baseline Control vs Seasonal Pattern]
In [331]:
##################################
# Evaluating baseline missingness
# of the simulated seasonal pattern scenario
# and the baseline control
##################################
plot_missingness_spike(p1, p6, SEASONAL_PATTERN_FEATURES, "Seasonal Pattern")
[Figure: per-feature missingness rate per chunk, Baseline Control vs Seasonal Pattern]
In [332]:
##################################
# Detecting univariate drift for seasonal pattern
##################################
univariate_drift_analysis_p6 = detect_univariate_drift(p1, p6, FEATURE_COLUMNS, "Seasonal Pattern")
Univariate drift visualization generated for Seasonal Pattern
       chunk                                                                  \
       chunk                                                                   
         key chunk_index start_index end_index start_date end_date    period   
0     [0:99]           0           0        99       None     None  analysis   
1  [100:199]           1         100       199       None     None  analysis   
2  [200:299]           2         200       299       None     None  analysis   
3  [300:399]           3         300       399       None     None  analysis   
4  [400:499]           4         400       499       None     None  analysis   
5  [500:599]           5         500       599       None     None  analysis   
6  [600:699]           6         600       699       None     None  analysis   
7  [700:799]           7         700       799       None     None  analysis   
8  [800:899]           8         800       899       None     None  analysis   
9  [900:999]           9         900       999       None     None  analysis   

           area_mean                                  ...       texture_mean  \
  kolmogorov_smirnov                                  ... kolmogorov_smirnov   
               value upper_threshold lower_threshold  ...    lower_threshold   
0              0.039        0.101506            None  ...               None   
1              0.685        0.101506            None  ...               None   
2              0.808        0.101506            None  ...               None   
3              0.783        0.101506            None  ...               None   
4              0.652        0.101506            None  ...               None   
5              0.060        0.101506            None  ...               None   
6              0.645        0.101506            None  ...               None   
7              0.746        0.101506            None  ...               None   
8              0.845        0.101506            None  ...               None   
9              0.656        0.101506            None  ...               None   

                 texture_se                                         \
         kolmogorov_smirnov                                          
   alert              value upper_threshold lower_threshold  alert   
0  False              0.056        0.144826            None  False   
1   True              0.087        0.144826            None  False   
2   True              0.049        0.144826            None  False   
3   True              0.071        0.144826            None  False   
4   True              0.099        0.144826            None  False   
5  False              0.106        0.144826            None  False   
6   True              0.095        0.144826            None  False   
7   True              0.066        0.144826            None  False   
8   True              0.059        0.144826            None  False   
9   True              0.113        0.144826            None  False   

       texture_worst                                         
  kolmogorov_smirnov                                         
               value upper_threshold lower_threshold  alert  
0              0.047        0.143381            None  False  
1              0.091        0.143381            None  False  
2              0.054        0.143381            None  False  
3              0.087        0.143381            None  False  
4              0.090        0.143381            None  False  
5              0.099        0.143381            None  False  
6              0.071        0.143381            None  False  
7              0.085        0.143381            None  False  
8              0.120        0.143381            None  False  
9              0.064        0.143381            None  False  

[10 rows x 127 columns]
In [333]:
##################################
# Visualizing univariate drift for seasonal pattern
##################################
univariate_drift_analysis_visualization_p6 = plot_univariate_drift_summary(univariate_drift_analysis_p6, FEATURE_COLUMNS, "Seasonal Pattern")
[Figure: univariate KS drift statistics per chunk with alert thresholds, Seasonal Pattern]
Univariate Drift Summary Table:
feature chunk_drift_count
0 radius_mean 8
1 texture_mean 8
2 perimeter_mean 8
3 area_mean 8
4 smoothness_mean 8
5 compactness_mean 8
6 concavity_mean 8
7 concave points_mean 8
8 symmetry_mean 8
9 fractal_dimension_mean 8
10 radius_se 0
11 texture_se 0
12 perimeter_se 0
13 area_se 0
14 smoothness_se 0
15 compactness_se 0
16 concavity_se 0
17 concave points_se 0
18 symmetry_se 0
19 fractal_dimension_se 0
20 radius_worst 0
21 texture_worst 0
22 perimeter_worst 0
23 area_worst 0
24 smoothness_worst 0
25 compactness_worst 0
26 concavity_worst 0
27 concave points_worst 0
28 symmetry_worst 0
29 fractal_dimension_worst 0
In [334]:
##################################
# Estimating CBPE performance for seasonal pattern
##################################
chunk_cbpe_performance_analysis_p6 = estimate_chunk_cbpe_performance(p1, p6, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk CBPE Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period value sampling_error realized upper_confidence_boundary lower_confidence_boundary upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.997118 0.003343 0.9948 1.000000 0.987090 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.993730 0.003343 0.9924 1.000000 0.983702 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.996812 0.003343 0.9944 1.000000 0.986784 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.995836 0.003343 0.9908 1.000000 0.985808 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.996884 0.003343 0.9980 1.000000 0.986857 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.998574 0.003343 0.9976 1.000000 0.988546 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.995688 0.003343 0.9928 1.000000 0.985660 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.998026 0.003343 1.0000 1.000000 0.987998 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.995331 0.003343 0.9980 1.000000 0.985303 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.996120 0.003343 0.9944 1.000000 0.986092 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.997118 0.003343 NaN 1.000000 0.987090 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.964931 0.003343 NaN 0.974958 0.954903 1 0.986902 True
12 [200:299] 2 200 299 None None analysis 0.987403 0.003343 NaN 0.997430 0.977375 1 0.986902 False
13 [300:399] 3 300 399 None None analysis 0.987108 0.003343 NaN 0.997136 0.977080 1 0.986902 False
14 [400:499] 4 400 499 None None analysis 0.983618 0.003343 NaN 0.993646 0.973590 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.998574 0.003343 NaN 1.000000 0.988546 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.984838 0.003343 NaN 0.994866 0.974810 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.950186 0.003343 NaN 0.960214 0.940158 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.942898 0.003343 NaN 0.952926 0.932871 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.988079 0.003343 NaN 0.998107 0.978051 1 0.986902 False
In [335]:
##################################
# Visualizing CBPE performance for seasonal pattern
##################################
chunk_cbpe_performance_analysis_visualization_p6 = plot_chunk_cbpe_performance(chunk_cbpe_performance_analysis_p6, baseline_name="Baseline Control", scenario_name="Seasonal Pattern")
[Figure: CBPE-estimated ROC-AUC per chunk with confidence bands and thresholds, Baseline Control vs Seasonal Pattern]
Chunk CBPE Performance Summary Table:
chunk_chunk_index chunk_period cbpe_roc_auc_alert_count
0 0 analysis 0
1 0 reference 0
2 1 analysis 1
3 1 reference 0
4 2 analysis 0
5 2 reference 0
6 3 analysis 0
7 3 reference 0
8 4 analysis 1
9 4 reference 0
10 5 analysis 0
11 5 reference 0
12 6 analysis 1
13 6 reference 0
14 7 analysis 1
15 7 reference 0
16 8 analysis 1
17 8 reference 0
18 9 analysis 0
19 9 reference 0
In [336]:
##################################
# Calculating realized performance for seasonal pattern
##################################
chunk_realized_performance_analysis_p6 = calculate_chunk_realized_performance(p1, p6, boosted_cb_optimal, FEATURE_COLUMNS)
Chunk Realized Performance Summary Table:
chunk roc_auc
key chunk_index start_index end_index start_date end_date period targets_missing_rate sampling_error value upper_threshold lower_threshold alert
0 [0:99] 0 0 99 None None reference 0.0 0.003343 0.9948 1 0.986902 False
1 [100:199] 1 100 199 None None reference 0.0 0.003343 0.9924 1 0.986902 False
2 [200:299] 2 200 299 None None reference 0.0 0.003343 0.9944 1 0.986902 False
3 [300:399] 3 300 399 None None reference 0.0 0.003343 0.9908 1 0.986902 False
4 [400:499] 4 400 499 None None reference 0.0 0.003343 0.9980 1 0.986902 False
5 [500:599] 5 500 599 None None reference 0.0 0.003343 0.9976 1 0.986902 False
6 [600:699] 6 600 699 None None reference 0.0 0.003343 0.9928 1 0.986902 False
7 [700:799] 7 700 799 None None reference 0.0 0.003343 1.0000 1 0.986902 False
8 [800:899] 8 800 899 None None reference 0.0 0.003343 0.9980 1 0.986902 False
9 [900:999] 9 900 999 None None reference 0.0 0.003343 0.9944 1 0.986902 False
10 [0:99] 0 0 99 None None analysis 0.0 0.003343 0.9948 1 0.986902 False
11 [100:199] 1 100 199 None None analysis 0.0 0.003343 0.9716 1 0.986902 True
12 [200:299] 2 200 299 None None analysis 0.0 0.003343 0.8500 1 0.986902 True
13 [300:399] 3 300 399 None None analysis 0.0 0.003343 0.7984 1 0.986902 True
14 [400:499] 4 400 499 None None analysis 0.0 0.003343 0.9440 1 0.986902 True
15 [500:599] 5 500 599 None None analysis 0.0 0.003343 0.9976 1 0.986902 False
16 [600:699] 6 600 699 None None analysis 0.0 0.003343 0.9712 1 0.986902 True
17 [700:799] 7 700 799 None None analysis 0.0 0.003343 0.8472 1 0.986902 True
18 [800:899] 8 800 899 None None analysis 0.0 0.003343 0.7448 1 0.986902 True
19 [900:999] 9 900 999 None None analysis 0.0 0.003343 0.9372 1 0.986902 True
In [337]:
##################################
# Visualizing the CBPE and realized performance comparison
# for seasonal pattern
##################################
chunk_realized_performance_analysis_visualization_p6 = plot_chunk_realized_performance(chunk_cbpe_performance_analysis_p1, chunk_realized_performance_analysis_p6, baseline_name="Baseline Control", scenario_name="Seasonal Pattern")
[Figure: CBPE vs realized ROC-AUC per chunk, Baseline Control vs Seasonal Pattern]
CBPE vs Realized ROC-AUC Deviation Summary (Seasonal Pattern):
chunk_chunk_index cbpe_roc_auc realized_roc_auc roc_auc_diff roc_auc_diff_alert
0 0 0.997118 0.9948 0.002318 False
1 1 0.993730 0.9716 0.022130 False
2 2 0.996812 0.8500 0.146812 True
3 3 0.995836 0.7984 0.197436 True
4 4 0.996884 0.9440 0.052884 True
5 5 0.998574 0.9976 0.000974 False
6 6 0.995688 0.9712 0.024488 False
7 7 0.998026 0.8472 0.150826 True
8 8 0.995331 0.7448 0.250531 True
9 9 0.996120 0.9372 0.058920 True

1.10 Consolidated Findings ¶

  1. This project explored the integration of NannyML into MLOps workflows to establish a proactive governance and early-warning framework for detecting and interpreting data and model shifts after deployment. The primary objective was to systematically examine how different types of drift and distributional changes manifest in machine learning pipelines and to demonstrate how robust, continuous monitoring can mitigate the risks of performance degradation and biased decision-making in production systems.
  2. Using a simulated baseline control, the study implemented and analyzed several drift scenarios to assess how NannyML’s capabilities, particularly Kolmogorov–Smirnov (KS) test statistics for univariate drift detection and Confidence-Based Performance Estimation (CBPE) for label-free performance estimation, respond to different types of shifts. The detection effectiveness for each drift type, both with and without access to ground truth labels, is summarized below:
    • Covariate Drift (shifts in feature distributions)
      • EDA observation: Characterized by distributional variability and unstable mean trends across features relative to the baseline control.
      • Without labels: Detected through distributional shift alerts using KS test statistics and degradation patterns in CBPE-estimated ROC-AUC trends relative to confidence intervals.
      • With labels: Confirmed through deviation alerts by comparing CBPE-estimated versus realized (true) ROC-AUC values per chunk.
    • Prior Shift (changes in target label proportions)
      • EDA observation: Evidenced by fluctuating class proportions and instability in feature distributions compared to the baseline.
      • Without labels: Detected through KS-based distributional shift alerts and CBPE-estimated ROC-AUC degradation trends.
      • With labels: Not effectively captured via CBPE-versus-realized ROC-AUC comparisons, highlighting the limitation of performance-based methods in detecting prior shifts.
    • Concept Drift (evolving relationships between features and outcomes)
      • EDA observation: Characterized by shifts in class-conditional relationships while marginal feature distributions remain relatively stable.
      • Without labels: Typically undetected by KS test or CBPE-based estimations due to unchanged input distributions.
      • With labels: Revealed through deviation alerts showing divergence between CBPE-estimated and realized ROC-AUC per chunk.
    • Missingness Spike (abrupt increases in absent data)
      • EDA observation: Marked by high missing value rates, irregular mean trends, and distributional distortions across features.
      • Without labels: Detected through KS-based distributional alerts and CBPE-estimated ROC-AUC declines relative to confidence intervals.
      • With labels: Confirmed via deviation analysis comparing estimated and realized ROC-AUC per chunk.
    • Seasonal Pattern (periodic variations in distributions)
      • EDA observation: Demonstrated by cyclical fluctuations and periodic instability in feature means over time.
      • Without labels: Detected through recurring distributional drift alerts and corresponding oscillations in CBPE-estimated ROC-AUC trends.
      • With labels: Validated by recurring deviations between estimated and realized ROC-AUC values across temporal chunks.
  3. While the primary emphasis of this study was on detection rather than intervention, potential remedial strategies for each drift type were also outlined to guide future operational responses:
    • Covariate Drift (shifts in feature distributions)
      • Retrain the model with recent data to realign feature distributions with the current environment.
      • Apply adaptive feature scaling or reweighting to reduce the impact of distributional shifts.
      • Monitor feature stability and adjust input selection or transformation as needed.
    • Prior Shift (changes in target label proportions)
      • Adjust prior probabilities or apply class rebalancing to reflect new class distributions.
      • Recalibrate model output probabilities to preserve predictive accuracy under changing prevalences (a minimal recalibration sketch is shown after this list).
      • Use active learning or periodic labeling to maintain alignment with evolving class ratios.
    • Concept Drift (evolving relationships between features and outcomes)
      • Implement online or incremental learning to adapt to evolving decision boundaries.
      • Employ ensemble approaches that gradually phase out outdated models.
      • Set automated retraining triggers when drift-induced performance drops are detected.
    • Missingness Spike (abrupt increases in absent data)
      • Enhance data validation pipelines to detect and flag abnormal increases in missingness.
      • Apply robust imputation or augmentation strategies based on the underlying missingness mechanism (MCAR, MAR, MNAR).
      • Conduct root-cause analysis to identify and address upstream data quality or system issues.
    • Seasonal Pattern (periodic variations in distributions)
      • Incorporate seasonality-aware features or cyclic time encoding to model recurring effects.
      • Use time-series decomposition or differencing to neutralize predictable periodic components.
      • Retrain or recalibrate the model at regular seasonal intervals to maintain consistent performance.
  4. The study demonstrated that NannyML provides a comprehensive and interpretable framework for monitoring both data and performance drifts, even in the absence of ground truth labels. By combining statistical drift detection with confidence-based performance estimation, it enables MLOps teams to maintain continuous visibility into model health and respond proactively to distributional changes that could otherwise go unnoticed until significant performance degradation occurs.
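
As a concrete illustration of the probability recalibration mentioned under the Prior Shift mitigations above, the sketch below re-weights calibrated positive-class probabilities when the deployed class prevalence differs from the training prevalence. The function and prevalence values are hypothetical examples, not part of the monitoring pipeline.

##################################
# Illustrative sketch: adjusting calibrated probabilities for a prior shift
##################################
import numpy as np

def adjust_for_prior_shift(proba_pos, train_prior, deployed_prior):
    # Re-weighting each class by the ratio of deployed to training prevalence,
    # then renormalizing so the adjusted probabilities sum to one
    proba_pos = np.asarray(proba_pos, dtype=float)
    pos = proba_pos * (deployed_prior / train_prior)
    neg = (1.0 - proba_pos) * ((1.0 - deployed_prior) / (1.0 - train_prior))
    return pos / (pos + neg)

# Example: probabilities from a model trained at 37% positive prevalence,
# deployed in an environment with 55% positive prevalence
print(adjust_for_prior_shift([0.2, 0.5, 0.8], train_prior=0.37, deployed_prior=0.55))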

2. Summary ¶

3. References ¶

  • [Book] Reliable Machine Learning by Cathy Chen, Niall Richard Murphy, Kranti Parisa, D. Sculley and Todd Underwood
  • [Book] Designing Machine Learning Systems by Chip Huyen
  • [Book] Machine Learning Design Patterns by Valliappa Lakshmanan, Sara Robinson and Michael Munn
  • [Book] Machine Learning Engineering by Andriy Burkov
  • [Book] Engineering MLOps by Emmanuel Raj
  • [Book] Introducing MLOps by Mark Treveil, Nicolas Omont, Clément Stenac, Kenji Lefevre, Du Phan, Joachim Zentici, Adrien Lavoillotte, Makoto Miyazaki and Lynn Heidmann
  • [Book] Practical MLOps by Noah Gift and Alfredo Deza
  • [Book] Data Science on AWS by Chris Fregly and Antje Barth
  • [Book] Ensemble Methods for Machine Learning by Gautam Kunapuli
  • [Book] Applied Predictive Modeling by Max Kuhn and Kjell Johnson
  • [Book] An Introduction to Statistical Learning by Gareth James, Daniela Witten, Trevor Hastie and Rob Tibshirani
  • [Book] Ensemble Methods: Foundations and Algorithms by Zhi-Hua Zhou
  • [Book] Effective XGBoost: Optimizing, Tuning, Understanding, and Deploying Classification Models (Treading on Python) by Matt Harrison, Edward Krueger, Alex Rook, Ronald Legere and Bojan Tunguz
  • [Python Library API] nannyML by NannyML Team
  • [Python Library API] NumPy by NumPy Team
  • [Python Library API] pandas by Pandas Team
  • [Python Library API] seaborn by Seaborn Team
  • [Python Library API] matplotlib.pyplot by MatPlotLib Team
  • [Python Library API] itertools by Python Team
  • [Python Library API] sklearn.experimental by Scikit-Learn Team
  • [Python Library API] sklearn.preprocessing by Scikit-Learn Team
  • [Python Library API] scipy by SciPy Team
  • [Python Library API] sklearn.tree by Scikit-Learn Team
  • [Python Library API] sklearn.ensemble by Scikit-Learn Team
  • [Python Library API] sklearn.metrics by Scikit-Learn Team
  • [Python Library API] xgboost by XGBoost Team
  • [Python Library API] lightgbm by LightGBM Team
  • [Python Library API] catboost by CatBoost Team
  • [Python Library API] StatsModels by StatsModels Team
  • [Article] Comprehensive Comparison of ML Model Monitoring Tools: Evidently AI, Alibi Detect, NannyML, WhyLabs, and Fiddler AI by Tanish Kandivlikar (Medium)
  • [Article] Monitoring AI in Production: Introduction to NannyML by Adnan Karol (Medium)
  • [Article] Data Drift Explainability: Interpretable Shift Detection with NannyML by Marco Cerliani (Towards Data Science)
  • [Article] An End-to-End ML Model Monitoring Workflow with NannyML in Python by Bex Tuychiyev (DataCamp)
  • [Article] Detecting Concept Drift: Impact on Machine Learning Performance by Michal Oleszak (NannyML.Com)
  • [Article] Estimating Model Performance Without Labels by Jakub Białek (NannyML.Com)
  • [Article] Monitoring Workflow for Machine Learning Systems by Santiago Víquez (NannyML.Com)
  • [Article] Don’t Let Yourself Be Fooled by Data Drift by Santiago Víquez (NannyML.Com)
  • [Article] Understanding Data Drift: Impact on Machine Learning Model Performance by Jakub Białek (NannyML.Com)
  • [Article] NannyML’s Guide to Data Quality and Covariate Shift by Magdalena Kowalczuk (NannyML.Com)
  • [Article] From Reactive to Proactive: Shift your ML Monitoring Approach by Qiamo (Luca) Zheng (NannyML.Com)
  • [Article] How to Detect Under-Performing Segments in ML Models by Kavita Rana (NannyML.Com)
  • [Article] Building Custom Metrics for Predictive Maintenance by Kavita Rana (NannyML.Com)
  • [Article] 3 Custom Metrics for Your Forecasting Models by Kavita Rana (NannyML.Com)
  • [Article] There's Data Drift, But Does It Matter? by Santiago Víquez (NannyML.Com)
  • [Article] Monitoring Custom Metrics without Ground Truth by Kavita Rana (NannyML.Com)
  • [Article] Which Multivariate Drift Detection Method Is Right for You: Comparing DRE and DC by Miles Weberman (NannyML.Com)
  • [Article] Prevent Failure of Product Defect Detection Models: A Post-Deployment Guide by Kavita Rana (NannyML.Com)
  • [Article] Common Pitfalls in Monitoring Default Prediction Models and How to Fix Them by Miles Weberman (NannyML.Com)
  • [Article] Why Relying on Training Data for ML Monitoring Can Trick You by Kavita Rana (NannyML.Com)
  • [Article] Using Concept Drift as a Model Retraining Trigger by Taliya Weinstein (NannyML.Com)
  • [Article] Retraining is Not All You Need by Miles Weberman (NannyML.Com)
  • [Article] A Comprehensive Guide to Univariate Drift Detection Methods by Kavita Rana (NannyML.Com)
  • [Article] Stress-free Monitoring of Predictive Maintenance Models by Kavita Rana (NannyML.Com)
  • [Article] Effective ML Monitoring: A Hands-on Example by Miles Weberman (NannyML.Com)
  • [Article] Don’t Drift Away with Your Data: Monitoring Data Drift from Setup to Cloud by Taliya Weinstein (NannyML.Com)
  • [Article] Comparing Multivariate Drift Detection Algorithms on Real-World Data by Kavita Rana (NannyML.Com)
  • [Article] Detect Data Drift Using Domain Classifier in Python by Miles Weberman (NannyML.Com)
  • [Article] Guide: How to evaluate if NannyML is the right monitoring tool for you by Santiago Víquez (NannyML.Com)
  • [Article] How To Monitor ML models with NannyML SageMaker Algorithms by Wiljan Cools (NannyML.Com)
  • [Article] Tutorial: Monitoring Missing and Unseen values with NannyML by Santiago Víquez (NannyML.Com)
  • [Article] Monitoring Machine Learning Models: A Fundamental Practice for Data Scientists and Machine Learning Engineers by Saurav Pawar (Medium)
  • [Article] Failure Is Not an Option: How to Prevent Your ML Model From Degradation by Maciej Balawejder (Medium)
  • [Article] Managing Data Drift and Data Distribution Shifts in the MLOps Lifecycle for Machine Learning Models by Abhishek Reddy (Medium)
  • [Article] “You Can’t Predict the Errors of Your Model”… Or Can You? by Samuele Mazzanti (Medium)
  • [Article] Understanding Concept Drift: A Simple Guide by Vitor Cerqueira (Medium)
  • [Article] Detecting Covariate Shift: A Guide to the Multivariate Approach by Michał Oleszak (Medium)
  • [Article] Data Drift vs. Concept Drift: Differences and How to Detect and Address Them by DataHeroes Team (DataHeroes.AI)
  • [Article] An Introduction to Machine Learning Engineering for Production /MLOps — Concept and Data Drifts by Praatibh Surana (Medium)
  • [Article] Concept Drift and Model Decay in Machine Learning by Ashok Chilakapati (Medium)
  • [Article] Data Drift: Types of Data Drift by Numal Jayawardena (Medium)
  • [Article] Monitoring Machine Learning models by Jacques Verre (Medium)
  • [Article] Data drift: It Can Come At You From Anywhere by Tirthajyoti Sarkar (Medium)
  • [Article] Drift in Machine Learning by Piotr (Peter) Mardziel (Medium)
  • [Article] Understanding Dataset Shift by Matthew Stewart (Medium)
  • [Article] Calculating Data Drift in Machine Learning using Python by Vatsal (Medium)
  • [Article] 91% of ML Models Degrade in Time by Santiago Víquez (Medium)
  • [Article] Model Drift in Machine Learning by Kurtis Pykes (Medium)
  • [Article] Production Machine Learning Monitoring: Outliers, Drift, Explainers & Statistical Performance by Alejandro Saucedo (Medium)
  • [Article] How to Detect Model Drift in MLOps Monitoring by Amit Paka (Medium)
  • [Article] “My data drifted. What’s next?” How to handle ML model drift in production. by Elena Samuylova (Medium)
  • [Article] Machine Learning Model Drift by Sophia Yang (Medium)
  • [Article] Estimating the Performance of an ML Model in the Absence of Ground Truth by Eryk Lewinson (Medium)
  • [Article] Ensemble: Boosting, Bagging, and Stacking Machine Learning by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Stacking Machine Learning: Everything You Need to Know by Ada Parker (MachineLearningPro.Org)
  • [Article] Ensemble Learning: Bagging, Boosting and Stacking by Edouard Duchesnay, Tommy Lofstedt and Feki Younes (Duchesnay.GitHub.IO)
  • [Article] Stack Machine Learning Models: Get Better Results by Casper Hansen (Developer.IBM.Com)
  • [Article] GradientBoosting vs AdaBoost vs XGBoost vs CatBoost vs LightGBM by Geeks for Geeks Team (GeeksForGeeks.Org)
  • [Article] A Gentle Introduction to the Gradient Boosting Algorithm for Machine Learning by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] The Ultimate Guide to AdaBoost Algorithm | What is AdaBoost Algorithm? by Ashish Kumar (MyGreatLearning.Com)
  • [Article] A Gentle Introduction to Ensemble Learning Algorithms by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Ensemble Methods: Elegant Techniques to Produce Improved Machine Learning Results by Necati Demir (Toptal.Com)
  • [Article] The Essential Guide to Ensemble Learning by Rohit Kundu (V7Labs.Com)
  • [Article] Develop an Intuition for How Ensemble Learning Works by Jason Brownlee (MachineLearningMastery.Com)
  • [Article] Mastering Ensemble Techniques in Machine Learning: Bagging, Boosting, Bayes Optimal Classifier, and Stacking by Rahul Jain (Medium)
  • [Article] Ensemble Learning: Bagging, Boosting, Stacking by Ayşe Kübra Kuyucu (Medium)
  • [Article] Ensemble: Boosting, Bagging, and Stacking Machine Learning by Aleyna Şenozan (Medium)
  • [Article] Boosting, Stacking, and Bagging for Ensemble Models for Time Series Analysis with Python by Kyle Jones (Medium)
  • [Article] Different types of Ensemble Techniques — Bagging, Boosting, Stacking, Voting, Blending by Abhishek Jain (Medium)
  • [Article] Understanding Ensemble Methods: Bagging, Boosting, and Stacking by Divya Bhagat (Medium)
  • [Video Tutorial] Concept Drift Detection with NannyML | Webinar by NannyML (YouTube)
  • [Video Tutorial] Fooled by Data Drift: How to Monitor ML Without False Positives by NannyML (YouTube)
  • [Video Tutorial] Monitoring Custom Metrics Without Access to Targets by NannyML (YouTube)
  • [Video Tutorial] Analyzing Your Model's Performance in Production by NannyML (YouTube)
  • [Video Tutorial] How to Monitor Predictive Maintenance Models | Webinar Replay by NannyML (YouTube)
  • [Video Tutorial] Machine Learning Monitoring Workflow [Webinar] by NannyML (YouTube)
  • [Video Tutorial] Monitoring Machine Learning Models on AWS | Webinar by NannyML (YouTube)
  • [Video Tutorial] Root Cause Analysis for ML Model Failure by NannyML (YouTube)
  • [Video Tutorial] Quantifying the Impact of Data Drift on Machine Learning Model Performance | Webinar by NannyML (YouTube)
  • [Video Tutorial] How to Detect Drift and Resolve Issues in Your Machine Learning Models? by NannyML (YouTube)
  • [Video Tutorial] Notebooks to Containers: Setting up Continuous (ML) Model Monitoring in Production by NannyML (YouTube)
  • [Video Tutorial] Performance Estimation using NannyML | Tutorial in Jupyter Notebook by NannyML (YouTube)
  • [Video Tutorial] What Is NannyML? Introducing Our Open Source Python Library by NannyML (YouTube)
  • [Video Tutorial] How to Automatically Retrain Your Models with Concept Drift Detection? by NannyML (YouTube)
  • [Video Tutorial] How to Use NannyML? Two Modes of Running Our Library by NannyML (YouTube)
  • [Video Tutorial] How to Integrate NannyML in Production? | Tutorial by NannyML (YouTube)
  • [Video Tutorial] Bringing Your Machine Learning Model to Production | Overview by NannyML (YouTube)
  • [Video Tutorial] ML Performance without Labels: Comparing Performance Estimation Methods (Webinar Replay) by NannyML (YouTube)
  • [Course] DataCamp Python Data Analyst Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Python Associate Data Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Python Data Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Machine Learning Engineer Certificate by DataCamp Team (DataCamp)
  • [Course] DataCamp Machine Learning Scientist Certificate by DataCamp Team (DataCamp)
  • [Course] IBM Data Analyst Professional Certificate by IBM Team (Coursera)
  • [Course] IBM Data Science Professional Certificate by IBM Team (Coursera)
  • [Course] IBM Machine Learning Professional Certificate by IBM Team (Coursera)