##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import os
%matplotlib inline

from operator import add,mul,truediv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from scipy import stats

##################################
# Defining file paths
# (assembled with os.path.join so the
# separator follows the host OS rather
# than a hard-coded Windows backslash)
##################################
DATASETS_ORIGINAL_PATH = os.path.join("datasets", "original")

##################################
# Reading CategoricalCancerRates.csv
# from DATASETS_ORIGINAL_PATH, located
# one directory above the working directory
##################################
cancer_rate_file_path = os.path.join("..", DATASETS_ORIGINAL_PATH, "CategoricalCancerRates.csv")
cancer_rate = pd.read_csv(cancer_rate_file_path)

##################################
# Reporting the dataset dimensions
# as a (row count, column count) pair
##################################
print('Dataset Dimensions: ')
dataset_dimensions = cancer_rate.shape
display(dataset_dimensions)

Dataset Dimensions:

(177, 22)

##################################
# Reporting the data type
# inferred for each column
##################################
print('Column Names and Data Types:')
column_data_types = cancer_rate.dtypes
display(column_data_types)

Column Names and Data Types:

COUNTRY     object
CANRAT      object
GDPPER     float64
URBPOP     float64
PATRES     float64
RNDGDP     float64
POPGRO     float64
LIFEXP     float64
TUBINC     float64
DTHCMD     float64
AGRLND     float64
GHGEMI     float64
RELOUT     float64
METEMI     float64
FORARE     float64
CO2EMI     float64
PM2EXP     float64
POPDEN     float64
ENRTER     float64
GDPCAP     float64
HDICAT      object
EPISCO     float64
dtype: object

##################################
# Previewing the first rows of the dataset
##################################
cancer_rate.head()

##################################
# Converting CANRAT and HDICAT into
# ordered categorical variables whose
# levels are listed from lowest to highest
##################################
ordered_category_levels = {
    'CANRAT': ['Low', 'High'],
    'HDICAT': ['L', 'M', 'H', 'VH'],
}
for categorical_variable, category_levels in ordered_category_levels.items():
    cancer_rate[categorical_variable] = cancer_rate[categorical_variable].astype('category')
    cancer_rate[categorical_variable] = cancer_rate[categorical_variable].cat.set_categories(category_levels, ordered=True)

##################################
# Summarizing the numeric variables
# with one row per variable
##################################
print('Numeric Variable Summary:')
numeric_variable_summary = cancer_rate.describe(include='number').T
display(numeric_variable_summary)

Numeric Variable Summary:

##################################
# Summarizing the object variable
# with one row per variable
##################################
print('Object Variable Summary:')
object_variable_summary = cancer_rate.describe(include='object').T
display(object_variable_summary)

Object Variable Summary:

##################################
# Summarizing the categorical variables
# with one row per variable
##################################
print('Categorical Variable Summary:')
categorical_variable_summary = cancer_rate.describe(include='category').T
display(categorical_variable_summary)

Categorical Variable Summary:

##################################
# Flagging the rows that repeat an earlier row
# and counting how many were flagged
##################################
duplicated_row_flags = cancer_rate.duplicated()
duplicated_row_flags.sum()

np.int64(0)

##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate.dtypes)

##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate.columns)

##################################
# Gathering the number of observations for each column
##################################
row_count_list = [len(cancer_rate)] * len(cancer_rate.columns)

##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate.isna().sum(axis=0))

##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate.count())

##################################
# Gathering the fill rate for each column
# (non-missing count divided by row count),
# stored as a list because a bare map object
# is exhausted after a single pass
##################################
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                              data_type_list,
                                              row_count_list,
                                              non_null_count_list,
                                              null_count_list,
                                              fill_rate_list),
                                          columns=['Column.Name',
                                                   'Column.Type',
                                                   'Row.Count',
                                                   'Non.Null.Count',
                                                   'Null.Count',
                                                   'Fill.Rate'])
display(all_column_quality_summary)

##################################
# Counting the number of columns
# with Fill.Rate < 1.00
##################################
incomplete_column_mask = all_column_quality_summary['Fill.Rate'] < 1
len(all_column_quality_summary[incomplete_column_mask])

20

##################################
# Listing the columns
# with Fill.Rate < 1.00,
# least complete first
##################################
incomplete_column_summary = all_column_quality_summary[incomplete_column_mask]
display(incomplete_column_summary.sort_values(by=['Fill.Rate'], ascending=True))

##################################
# Identifying the columns
# with Fill.Rate < 0.90
##################################
low_fill_column_mask = all_column_quality_summary['Fill.Rate'] < 0.90
column_low_fill_rate = all_column_quality_summary[low_fill_column_mask]

##################################
# Gathering the metadata labels for each observation
##################################
row_metadata_list = cancer_rate["COUNTRY"].values.tolist()

##################################
# Gathering the number of columns for each observation
##################################
column_count_list = [len(cancer_rate.columns)] * len(cancer_rate)

##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(cancer_rate.isna().sum(axis=1))

##################################
# Gathering the missing data rate for each row
# (missing count divided by column count),
# stored as a list because a bare map object
# is exhausted after a single pass
##################################
missing_rate_list = list(map(truediv, null_row_list, column_count_list))

##################################
# Formulating the summary
# for all rows
##################################
all_row_quality_summary = pd.DataFrame(zip(row_metadata_list,
                                           column_count_list,
                                           null_row_list,
                                           missing_rate_list),
                                       columns=['Row.Name',
                                                'Column.Count',
                                                'Null.Count',
                                                'Missing.Rate'])
display(all_row_quality_summary)

##################################
# Counting the number of rows
# with Missing.Rate > 0.00
##################################
any_missing_row_mask = all_row_quality_summary['Missing.Rate'] > 0.00
len(all_row_quality_summary[any_missing_row_mask])

120

##################################
# Counting the number of rows
# with Missing.Rate > 0.20
##################################
high_missing_row_mask = all_row_quality_summary['Missing.Rate'] > 0.20
len(all_row_quality_summary[high_missing_row_mask])

14

##################################
# Identifying the rows
# with Missing.Rate > 0.20
##################################
row_high_missing_rate = all_row_quality_summary[high_missing_row_mask]

##################################
# Listing the rows
# with Missing.Rate > 0.20,
# most incomplete first
##################################
display(all_row_quality_summary[high_missing_row_mask].sort_values(by=['Missing.Rate'], ascending=False))

##################################
# Formulating the dataset
# with numeric columns only
##################################
cancer_rate_numeric = cancer_rate.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = cancer_rate_numeric.columns

##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = cancer_rate_numeric.min()

##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = cancer_rate_numeric.mean()

##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = cancer_rate_numeric.median()

##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = cancer_rate_numeric.max()

##################################
# Tabulating the non-missing value frequencies
# once per numeric column (most frequent first)
# so every mode statistic below reads
# from the same frequency table
##################################
numeric_value_counts_list = [cancer_rate_numeric[x].value_counts(dropna=True) for x in cancer_rate_numeric]

##################################
# Gathering the first mode values for each numeric column
# (NaN when the frequency table is empty)
##################################
numeric_first_mode_list = [value_counts.index.tolist()[0] if len(value_counts) > 0 else np.nan
                           for value_counts in numeric_value_counts_list]

##################################
# Gathering the second mode values for each numeric column
# (NaN when the frequency table has fewer than two entries,
# instead of failing with an IndexError)
##################################
numeric_second_mode_list = [value_counts.index.tolist()[1] if len(value_counts) > 1 else np.nan
                            for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [value_counts.iloc[0] if len(value_counts) > 0 else 0
                                 for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of second mode values for each numeric column
##################################
numeric_second_mode_count_list = [value_counts.iloc[1] if len(value_counts) > 1 else 0
                                  for value_counts in numeric_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each numeric column
# (infinite when there is no second mode occurrence to divide by,
# NaN when the column has no values at all)
##################################
numeric_first_second_mode_ratio_list = [
    first_mode_count / second_mode_count if second_mode_count
    else (np.inf if first_mode_count else np.nan)
    for first_mode_count, second_mode_count in zip(numeric_first_mode_count_list,
                                                   numeric_second_mode_count_list)
]

##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = cancer_rate_numeric.nunique(dropna=True)

##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = [len(cancer_rate_numeric)] * len(cancer_rate_numeric.columns)

##################################
# Gathering the unique to count ratio for each numeric column
# (stored as a list because a bare map object
# is exhausted after a single pass)
##################################
numeric_unique_count_ratio_list = list(map(truediv, numeric_unique_count_list, numeric_row_count_list))

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_numeric.skew()

##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = cancer_rate_numeric.kurtosis()

##################################
# Formulating the summary
# for all numeric columns
##################################
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_minimum_list,
                                                  numeric_mean_list,
                                                  numeric_median_list,
                                                  numeric_maximum_list,
                                                  numeric_first_mode_list,
                                                  numeric_second_mode_list,
                                                  numeric_first_mode_count_list,
                                                  numeric_second_mode_count_list,
                                                  numeric_first_second_mode_ratio_list,
                                                  numeric_unique_count_list,
                                                  numeric_row_count_list,
                                                  numeric_unique_count_ratio_list,
                                                  numeric_skewness_list,
                                                  numeric_kurtosis_list),
                                              columns=['Numeric.Column.Name',
                                                       'Minimum',
                                                       'Mean',
                                                       'Median',
                                                       'Maximum',
                                                       'First.Mode',
                                                       'Second.Mode',
                                                       'First.Mode.Count',
                                                       'Second.Mode.Count',
                                                       'First.Second.Mode.Ratio',
                                                       'Unique.Count',
                                                       'Row.Count',
                                                       'Unique.Count.Ratio',
                                                       'Skewness',
                                                       'Kurtosis'])
display(numeric_column_quality_summary)

##################################
# Counting the number of numeric columns
# with First.Second.Mode.Ratio > 5.00
##################################
numeric_high_mode_ratio_mask = numeric_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(numeric_column_quality_summary[numeric_high_mode_ratio_mask])

1

##################################
# Listing the numeric columns
# with First.Second.Mode.Ratio > 5.00,
# largest ratio first
##################################
display(numeric_column_quality_summary[numeric_high_mode_ratio_mask].sort_values(by=['First.Second.Mode.Ratio'], ascending=False))

##################################
# Counting the number of numeric columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is built as
# Unique.Count / Row.Count, so it cannot exceed 1
# and this filter can never match - confirm
# the intended threshold
##################################
numeric_high_unique_ratio_mask = numeric_column_quality_summary['Unique.Count.Ratio'] > 10
len(numeric_column_quality_summary[numeric_high_unique_ratio_mask])

0

##################################
# Counting the number of numeric columns
# with Skewness > 3.00 or Skewness < -3.00
##################################
numeric_high_skewness_mask = numeric_column_quality_summary['Skewness'].abs() > 3
len(numeric_column_quality_summary[numeric_high_skewness_mask])

5

##################################
# Listing the numeric columns
# with Skewness > 3.00 or Skewness < -3.00,
# most positively skewed first
##################################
display(numeric_column_quality_summary[numeric_high_skewness_mask].sort_values(by=['Skewness'], ascending=False))

##################################
# Formulating the dataset
# with object column only
##################################
cancer_rate_object = cancer_rate.select_dtypes(include='object')

##################################
# Gathering the variable names for the object column
##################################
object_variable_name_list = cancer_rate_object.columns

##################################
# Tabulating the non-missing value frequencies
# once per object column (most frequent first)
# so every mode statistic below reads
# from the same frequency table
##################################
object_value_counts_list = [cancer_rate_object[x].value_counts(dropna=True) for x in cancer_rate_object]

##################################
# Gathering the first mode values for the object column
# (NaN when the frequency table is empty)
##################################
object_first_mode_list = [value_counts.index.tolist()[0] if len(value_counts) > 0 else np.nan
                          for value_counts in object_value_counts_list]

##################################
# Gathering the second mode values for each object column
# (NaN when the frequency table has fewer than two entries,
# instead of failing with an IndexError)
##################################
object_second_mode_list = [value_counts.index.tolist()[1] if len(value_counts) > 1 else np.nan
                           for value_counts in object_value_counts_list]

##################################
# Gathering the count of first mode values for each object column
##################################
object_first_mode_count_list = [value_counts.iloc[0] if len(value_counts) > 0 else 0
                                for value_counts in object_value_counts_list]

##################################
# Gathering the count of second mode values for each object column
##################################
object_second_mode_count_list = [value_counts.iloc[1] if len(value_counts) > 1 else 0
                                 for value_counts in object_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each object column
# (infinite when there is no second mode occurrence to divide by,
# NaN when the column has no values at all)
##################################
object_first_second_mode_ratio_list = [
    first_mode_count / second_mode_count if second_mode_count
    else (np.inf if first_mode_count else np.nan)
    for first_mode_count, second_mode_count in zip(object_first_mode_count_list,
                                                   object_second_mode_count_list)
]

##################################
# Gathering the count of unique values for each object column
##################################
object_unique_count_list = cancer_rate_object.nunique(dropna=True)

##################################
# Gathering the number of observations for each object column
##################################
object_row_count_list = [len(cancer_rate_object)] * len(cancer_rate_object.columns)

##################################
# Gathering the unique to count ratio for each object column
# (stored as a list because a bare map object
# is exhausted after a single pass)
##################################
object_unique_count_ratio_list = list(map(truediv, object_unique_count_list, object_row_count_list))

##################################
# Formulating the summary
# for all object columns
##################################
object_column_quality_summary = pd.DataFrame(zip(object_variable_name_list,
                                                 object_first_mode_list,
                                                 object_second_mode_list,
                                                 object_first_mode_count_list,
                                                 object_second_mode_count_list,
                                                 object_first_second_mode_ratio_list,
                                                 object_unique_count_list,
                                                 object_row_count_list,
                                                 object_unique_count_ratio_list),
                                             columns=['Object.Column.Name',
                                                      'First.Mode',
                                                      'Second.Mode',
                                                      'First.Mode.Count',
                                                      'Second.Mode.Count',
                                                      'First.Second.Mode.Ratio',
                                                      'Unique.Count',
                                                      'Row.Count',
                                                      'Unique.Count.Ratio'])
display(object_column_quality_summary)

##################################
# Counting the number of object columns
# with First.Second.Mode.Ratio > 5.00
##################################
object_high_mode_ratio_mask = object_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(object_column_quality_summary[object_high_mode_ratio_mask])

0

##################################
# Counting the number of object columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is built as
# Unique.Count / Row.Count, so it cannot exceed 1
# and this filter can never match - confirm
# the intended threshold
##################################
object_high_unique_ratio_mask = object_column_quality_summary['Unique.Count.Ratio'] > 10
len(object_column_quality_summary[object_high_unique_ratio_mask])

0

##################################
# Formulating the dataset
# with categorical columns only
##################################
cancer_rate_categorical = cancer_rate.select_dtypes(include='category')

##################################
# Gathering the variable names for the categorical column
##################################
categorical_variable_name_list = cancer_rate_categorical.columns

##################################
# Tabulating the non-missing value frequencies
# once per categorical column (most frequent first)
# so every mode statistic below reads
# from the same frequency table
##################################
categorical_value_counts_list = [cancer_rate_categorical[x].value_counts(dropna=True) for x in cancer_rate_categorical]

##################################
# Gathering the first mode values for each categorical column
# (NaN when the frequency table is empty)
##################################
categorical_first_mode_list = [value_counts.index.tolist()[0] if len(value_counts) > 0 else np.nan
                               for value_counts in categorical_value_counts_list]

##################################
# Gathering the second mode values for each categorical column
# (NaN when the frequency table has fewer than two entries,
# instead of failing with an IndexError)
##################################
categorical_second_mode_list = [value_counts.index.tolist()[1] if len(value_counts) > 1 else np.nan
                                for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of first mode values for each categorical column
##################################
categorical_first_mode_count_list = [value_counts.iloc[0] if len(value_counts) > 0 else 0
                                     for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of second mode values for each categorical column
##################################
categorical_second_mode_count_list = [value_counts.iloc[1] if len(value_counts) > 1 else 0
                                      for value_counts in categorical_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each categorical column
# (infinite when there is no second mode occurrence to divide by,
# NaN when the column has no values at all)
##################################
categorical_first_second_mode_ratio_list = [
    first_mode_count / second_mode_count if second_mode_count
    else (np.inf if first_mode_count else np.nan)
    for first_mode_count, second_mode_count in zip(categorical_first_mode_count_list,
                                                   categorical_second_mode_count_list)
]

##################################
# Gathering the count of unique values for each categorical column
##################################
categorical_unique_count_list = cancer_rate_categorical.nunique(dropna=True)

##################################
# Gathering the number of observations for each categorical column
##################################
categorical_row_count_list = [len(cancer_rate_categorical)] * len(cancer_rate_categorical.columns)

##################################
# Gathering the unique to count ratio for each categorical column
# (stored as a list because a bare map object
# is exhausted after a single pass)
##################################
categorical_unique_count_ratio_list = list(map(truediv, categorical_unique_count_list, categorical_row_count_list))

##################################
# Formulating the summary
# for all categorical columns
##################################
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
                                                      categorical_first_mode_list,
                                                      categorical_second_mode_list,
                                                      categorical_first_mode_count_list,
                                                      categorical_second_mode_count_list,
                                                      categorical_first_second_mode_ratio_list,
                                                      categorical_unique_count_list,
                                                      categorical_row_count_list,
                                                      categorical_unique_count_ratio_list),
                                                  columns=['Categorical.Column.Name',
                                                           'First.Mode',
                                                           'Second.Mode',
                                                           'First.Mode.Count',
                                                           'Second.Mode.Count',
                                                           'First.Second.Mode.Ratio',
                                                           'Unique.Count',
                                                           'Row.Count',
                                                           'Unique.Count.Ratio'])
display(categorical_column_quality_summary)

##################################
# Counting the number of categorical columns
# with First.Second.Mode.Ratio > 5.00
##################################
categorical_high_mode_ratio_mask = categorical_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(categorical_column_quality_summary[categorical_high_mode_ratio_mask])

0

##################################
# Counting the number of categorical columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is built as
# Unique.Count / Row.Count, so it cannot exceed 1
# and this filter can never match - confirm
# the intended threshold
##################################
categorical_high_unique_ratio_mask = categorical_column_quality_summary['Unique.Count.Ratio'] > 10
len(categorical_column_quality_summary[categorical_high_unique_ratio_mask])

0

##################################
# Reporting the dimensions of the original dataset
# before any row or column is filtered out
##################################
print('Dataset Dimensions: ')
original_dataset_dimensions = cancer_rate.shape
display(original_dataset_dimensions)

Dataset Dimensions:

(177, 22)

##################################
# Removing the rows whose COUNTRY label
# was flagged with Missing.Rate > 0.20
##################################
high_missing_row_names = row_high_missing_rate['Row.Name'].values.tolist()
high_missing_row_index = cancer_rate[cancer_rate['COUNTRY'].isin(high_missing_row_names)].index
cancer_rate_filtered_row = cancer_rate.drop(index=high_missing_row_index)

##################################
# Reporting the dimensions of the row-filtered dataset
##################################
print('Dataset Dimensions: ')
row_filtered_dataset_dimensions = cancer_rate_filtered_row.shape
display(row_filtered_dataset_dimensions)

Dataset Dimensions:

(163, 22)

##################################
# Removing the columns that were
# flagged with Fill.Rate < 0.90
##################################
low_fill_column_names = column_low_fill_rate['Column.Name'].values.tolist()
cancer_rate_filtered_row_column = cancer_rate_filtered_row.drop(columns=low_fill_column_names)

##################################
# Exposing the row- and column-filtered data
# under the name used for the cleaned dataset
# (same object, not a copy)
##################################
cancer_rate_cleaned = cancer_rate_filtered_row_column

##################################
# Reporting the dimensions of the cleaned dataset
##################################
print('Dataset Dimensions: ')
cleaned_dataset_dimensions = cancer_rate_cleaned.shape
display(cleaned_dataset_dimensions)

Dataset Dimensions:

(163, 18)

##################################
# Gathering, for each cleaned column, its name,
# data type, row count, non-missing count
# and missing count
##################################
cleaned_column_name_list = list(cancer_rate_cleaned.columns)
cleaned_data_type_list = list(cancer_rate_cleaned.dtypes)
cleaned_row_count_list = [len(cancer_rate_cleaned)] * len(cancer_rate_cleaned.columns)
cleaned_non_null_count_list = list(cancer_rate_cleaned.count())
cleaned_null_count_list = list(cancer_rate_cleaned.isna().sum(axis=0))

##################################
# Formulating the summary
# for all cleaned columns
##################################
cleaned_column_quality_summary = pd.DataFrame(
    zip(cleaned_column_name_list,
        cleaned_data_type_list,
        cleaned_row_count_list,
        cleaned_non_null_count_list,
        cleaned_null_count_list),
    columns=['Column.Name',
             'Column.Type',
             'Row.Count',
             'Non.Null.Count',
             'Null.Count'])
display(cleaned_column_quality_summary)

##################################
# Formulating the cleaned dataset
# with categorical columns only
# (selected by the 'category' dtype that
# CANRAT and HDICAT were converted to,
# not by 'object', which would pick up
# the COUNTRY label column instead)
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='category')

##################################
# Formulating the cleaned dataset
# with numeric columns only
##################################
cancer_rate_cleaned_numeric = cancer_rate_cleaned.select_dtypes(include='number')

##################################
# Taking a snapshot of the cleaned
# numeric dataset
##################################
cancer_rate_cleaned_numeric.head()

##################################
# Defining the linear regression estimator
# fitted at each step of the
# round-robin imputation
##################################
lr = LinearRegression()

##################################
# Configuring the iterative imputer, which
# estimates each column with missing values
# as a function of the other columns
# in a round-robin fashion
##################################
iterative_imputer_settings = {
    'estimator': lr,
    'max_iter': 10,
    'tol': 1e-10,
    'imputation_order': 'ascending',
    'random_state': 88888888,
}
iterative_imputer = IterativeImputer(**iterative_imputer_settings)

##################################
# Fitting the imputer on the cleaned numeric
# columns and filling in their missing values
##################################
cancer_rate_imputed_numeric_array = iterative_imputer.fit_transform(cancer_rate_cleaned_numeric)

##################################
# Wrapping the imputed array in a dataframe
# that reuses the cleaned column names
# (no index is passed, so the rows get
# a fresh default index)
##################################
cancer_rate_imputed_numeric = pd.DataFrame(data=cancer_rate_imputed_numeric_array,
                                           columns=cancer_rate_cleaned_numeric.columns)

##################################
# Previewing the first rows of the imputed dataset
##################################
cancer_rate_imputed_numeric.head()

##################################
# Selecting the categorical columns
# of the cleaned dataset
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='category')

##################################
# Replacing the missing HDICAT values
# with the most frequent HDICAT category,
# then discarding the row index so the rows
# are labelled by position
##################################
hdicat_most_frequent_category = cancer_rate_cleaned_categorical['HDICAT'].mode()[0]
cancer_rate_cleaned_categorical['HDICAT'] = cancer_rate_cleaned_categorical['HDICAT'].fillna(hdicat_most_frequent_category)
cancer_rate_imputed_categorical = cancer_rate_cleaned_categorical.reset_index(drop=True)

##################################
# Joining the imputed numeric and categorical
# columns side by side on their shared index
##################################
imputed_dataset_parts = [cancer_rate_imputed_numeric, cancer_rate_imputed_categorical]
cancer_rate_imputed = pd.concat(imputed_dataset_parts, axis=1, join='inner')

##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate_imputed.dtypes)

##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate_imputed.columns)

##################################
# Gathering the number of observations for each column
##################################
row_count_list = [len(cancer_rate_imputed)] * len(cancer_rate_imputed.columns)

##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate_imputed.isna().sum(axis=0))

##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate_imputed.count())

##################################
# Gathering the fill rate for each column
# (non-missing count divided by row count),
# stored as a list because a bare map object
# is exhausted after a single pass
##################################
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

##################################
# Formulating the summary
# for all imputed columns
##################################
imputed_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                                  data_type_list,
                                                  row_count_list,
                                                  non_null_count_list,
                                                  null_count_list,
                                                  fill_rate_list),
                                              columns=['Column.Name',
                                                       'Column.Type',
                                                       'Row.Count',
                                                       'Non.Null.Count',
                                                       'Null.Count',
                                                       'Fill.Rate'])
display(imputed_column_quality_summary)

##################################
# Formulating the imputed dataset
# with numeric columns only
##################################
cancer_rate_imputed_numeric = cancer_rate_imputed.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = list(cancer_rate_imputed_numeric.columns)

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_imputed_numeric.skew()

##################################
# Computing the interquartile range
# for all columns
##################################
cancer_rate_imputed_numeric_q1 = cancer_rate_imputed_numeric.quantile(0.25)
cancer_rate_imputed_numeric_q3 = cancer_rate_imputed_numeric.quantile(0.75)
cancer_rate_imputed_numeric_iqr = cancer_rate_imputed_numeric_q3 - cancer_rate_imputed_numeric_q1

##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
# (values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR)
##################################
numeric_outlier_lower_fence = cancer_rate_imputed_numeric_q1 - 1.5 * cancer_rate_imputed_numeric_iqr
numeric_outlier_upper_fence = cancer_rate_imputed_numeric_q3 + 1.5 * cancer_rate_imputed_numeric_iqr
numeric_outlier_count_list = ((cancer_rate_imputed_numeric < numeric_outlier_lower_fence)
                              | (cancer_rate_imputed_numeric > numeric_outlier_upper_fence)).sum()

##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = [len(cancer_rate_imputed_numeric)] * len(cancer_rate_imputed_numeric.columns)

##################################
# Gathering the outlier ratio for each numeric column
# (outlier count divided by row count),
# stored as a list because a bare map object
# is exhausted after a single pass
##################################
numeric_outlier_ratio_list = list(map(truediv, numeric_outlier_count_list, numeric_row_count_list))

##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_skewness_list,
                                                  numeric_outlier_count_list,
                                                  numeric_row_count_list,
                                                  numeric_outlier_ratio_list),
                                              columns=['Numeric.Column.Name',
                                                       'Skewness',
                                                       'Outlier.Count',
                                                       'Row.Count',
                                                       'Outlier.Ratio'])
display(numeric_column_outlier_summary)

##################################
# Drawing one wide, short boxplot
# on its own figure
# for every numeric column
##################################
for column in cancer_rate_imputed_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(data=cancer_rate_imputed_numeric, x=column)

##################################
# Formulating a function 
# to plot the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
def plot_correlation_matrix(corr, mask=None):
    """Draw an annotated heatmap of ``corr`` on a newly created figure.

    Parameters
    ----------
    corr : matrix of values passed to ``sns.heatmap``.
    mask : optional mask forwarded unchanged to ``sns.heatmap``.

    The colour scale is fixed to [-1, 1] and centred on 0; nothing is returned.
    """
    heatmap_options = dict(
        annot=True,
        vmin=-1,
        vmax=1,
        center=0,
        cmap='coolwarm',
        linewidths=1,
        linecolor='gray',
        cbar_kws={'orientation': 'horizontal'},
    )
    _, heatmap_axis = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr, ax=heatmap_axis, mask=mask, **heatmap_options)

##################################
# Computing the correlation coefficients
# and correlation p-values
# among pairs of numeric columns
##################################
# Each unordered pair of numeric columns is keyed as '<first>_<second>'
# and mapped to the result returned by stats.pearsonr
cancer_rate_imputed_numeric_columns = cancer_rate_imputed_numeric.columns.tolist()
cancer_rate_imputed_numeric_correlation_pairs = {
    numeric_column_a + '_' + numeric_column_b: stats.pearsonr(
        cancer_rate_imputed_numeric[numeric_column_a],
        cancer_rate_imputed_numeric[numeric_column_b])
    for numeric_column_a, numeric_column_b
    in itertools.combinations(cancer_rate_imputed_numeric_columns, 2)
}

##################################
# Tabulating the pairwise correlations
# (one row per column pair) and showing
# the 20 largest coefficients
##################################
cancer_rate_imputed_numeric_summary = pd.DataFrame.from_dict(
    cancer_rate_imputed_numeric_correlation_pairs,
    orient='index')
cancer_rate_imputed_numeric_summary.columns = ['Pearson.Correlation.Coefficient', 'Correlation.PValue']
cancer_rate_imputed_numeric_top_pairs = cancer_rate_imputed_numeric_summary.sort_values(
    by=['Pearson.Correlation.Coefficient'],
    ascending=False).head(20)
display(cancer_rate_imputed_numeric_top_pairs)

##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
cancer_rate_imputed_numeric_correlation = cancer_rate_imputed_numeric.corr()
# np.triu keeps the upper triangle (diagonal included) and zeroes the rest;
# that array is handed to the heatmap as its mask
mask = np.triu(cancer_rate_imputed_numeric_correlation)
plot_correlation_matrix(corr=cancer_rate_imputed_numeric_correlation, mask=mask)
plt.show()

##################################
# Formulating a function
# to compute the matrix of
# correlation p-values
# for all pairwise combinations
# of numeric columns
##################################
def correlation_significance(df=None):
    """Return the matrix of Pearson correlation p-values between columns of ``df``.

    Parameters
    ----------
    df : DataFrame whose columns are compared pairwise with ``stats.pearsonr``.

    Returns
    -------
    numpy.ndarray of shape (n_columns, n_columns). Entry [i, j] holds the
    p-value for columns i and j; the diagonal is left at 0 because a column
    is never tested against itself.
    """
    column_count = df.shape[1]
    p_matrix = np.zeros(shape=(column_count, column_count))
    # Columns are addressed by position, so no label lookup is repeated
    # inside the loop and each unordered pair is tested only once
    for first_position, second_position in itertools.combinations(range(column_count), 2):
        _, p = stats.pearsonr(df.iloc[:, first_position], df.iloc[:, second_position])
        p_matrix[first_position, second_position] = p
        p_matrix[second_position, first_position] = p
    return p_matrix

##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
# with significant p-values only
##################################
cancer_rate_imputed_numeric_correlation_p_values = correlation_significance(cancer_rate_imputed_numeric)
# Only lower-triangle cells (diagonal included) with a p-value below 0.05
# stay visible; the mask is the logical negation of that selection
significant_lower_triangle = np.tril(cancer_rate_imputed_numeric_correlation_p_values < 0.05)
mask = ~significant_lower_triangle
plot_correlation_matrix(cancer_rate_imputed_numeric_correlation, mask)

##################################
# Filtering out one among the 
# highly correlated variable pairs with
# lesser Pearson.Correlation.Coefficient
# when compared to the target variable
##################################
# The two columns are removed from the existing DataFrame object itself
cancer_rate_imputed_numeric.drop(columns=['GDPPER', 'METEMI'], inplace=True)

##################################
# Checking the dimensions of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_imputed_numeric.shape)

Dataset Dimensions:

(163, 13)

##################################
# Conducting a Yeo-Johnson Transformation
# to address the distributional
# shape of the variables
##################################
# standardize=False: the transformer only reshapes the distributions here
yeo_johnson_transformer = PowerTransformer(standardize=False, method='yeo-johnson')
cancer_rate_imputed_numeric_array = yeo_johnson_transformer.fit_transform(cancer_rate_imputed_numeric)

##################################
# Wrapping the transformed array
# in a new dataset object that
# reuses the original column names
##################################
cancer_rate_transformed_numeric = pd.DataFrame(
    data=cancer_rate_imputed_numeric_array,
    columns=cancer_rate_imputed_numeric.columns,
)

##################################
# Formulating the individual boxplots
# for all transformed numeric columns
##################################
# One wide, short figure is opened per transformed column
for column in cancer_rate_transformed_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=cancer_rate_transformed_numeric)

##################################
# Filtering out the column
# which remained skewed even
# after applying shape transformation
##################################
# The column is removed from the existing DataFrame object itself
cancer_rate_transformed_numeric.drop(columns=['PM2EXP'], inplace=True)

##################################
# Checking the dimensions of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_transformed_numeric.shape)

Dataset Dimensions:

(163, 12)

##################################
# Conducting standardization
# to transform the values of the 
# variables into comparable scale
##################################
# The scaler is fitted on, and applied to, the transformed columns in one call
standardization_scaler = StandardScaler()
cancer_rate_transformed_numeric_array = standardization_scaler.fit_transform(cancer_rate_transformed_numeric)

##################################
# Wrapping the scaled array
# in a new dataset object that
# reuses the transformed column names
##################################
cancer_rate_scaled_numeric = pd.DataFrame(
    data=cancer_rate_transformed_numeric_array,
    columns=cancer_rate_transformed_numeric.columns,
)

##################################
# Formulating the individual boxplots
# for all transformed numeric columns
##################################
# One wide, short figure is opened per scaled column
for column in cancer_rate_scaled_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=cancer_rate_scaled_numeric)

##################################
# Formulating the categorical column
# for encoding transformation
##################################
# The HDICAT values are copied into a plain list first, so the new
# single-column frame gets a fresh default index
hdicat_values = cancer_rate_cleaned_categorical['HDICAT'].to_list()
cancer_rate_categorical_encoded = pd.DataFrame({'HDICAT': hdicat_values})

##################################
# One-hot encoding the HDICAT column
# into one indicator column per level
##################################
cancer_rate_categorical_encoded = pd.get_dummies(cancer_rate_categorical_encoded, columns=['HDICAT'])

##################################
# Placing the scaled numeric columns
# and the encoded categorical columns
# side by side, keeping shared row labels only
##################################
cancer_rate_preprocessed = pd.concat(
    objs=[cancer_rate_scaled_numeric, cancer_rate_categorical_encoded],
    axis=1,
    join='inner',
)

##################################
# Checking the dimensions of the consolidated dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_preprocessed.shape)

Dataset Dimensions:

(163, 16)

##################################
# Segregating the target
# and predictor variable lists
##################################
# Target: CANRAT as a one-column frame with a fresh 0..n-1 index,
# so the joins below align it with the preprocessed rows by position
cancer_rate_preprocessed_target = cancer_rate_filtered_row['CANRAT'].to_frame().reset_index(drop=True)

# Encoded categorical predictors, alone and joined with the target
encoded_column_names = cancer_rate_categorical_encoded.columns
cancer_rate_preprocessed_categorical = cancer_rate_preprocessed[encoded_column_names]
cancer_rate_preprocessed_categorical_combined = cancer_rate_preprocessed_categorical.join(cancer_rate_preprocessed_target)

# Numeric predictors: whatever remains once the encoded columns are removed
cancer_rate_preprocessed = cancer_rate_preprocessed.drop(columns=encoded_column_names)
cancer_rate_preprocessed_predictors = cancer_rate_preprocessed.columns
cancer_rate_preprocessed_combined = cancer_rate_preprocessed.join(cancer_rate_preprocessed_target)

##################################
# Segregating the target
# and predictor variable names
##################################
y_variable = 'CANRAT'
x_variables = cancer_rate_preprocessed_predictors

##################################
# Laying out a 6 x 2 grid of subplots
# and flattening the axes array so
# it can be indexed by position
##################################
num_rows, num_cols = 6, 2
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 30))
axes = axes.ravel()

##################################
# Drawing, for every scaled numeric column,
# one boxplot per observed target class
##################################
# The grouping and the tick positions are the same for every predictor
target_class_groups = cancer_rate_preprocessed_combined.groupby(y_variable, observed=True)
tick_positions = range(1, len(cancer_rate_preprocessed_combined[y_variable].unique()) + 1)
for i, x_variable in enumerate(x_variables):
    ax = axes[i]
    ax.boxplot([group[x_variable] for _, group in target_class_groups])
    ax.set_title(f'{y_variable} Versus {x_variable}')
    ax.set(xlabel=y_variable, ylabel=x_variable)
    ax.set_xticks(tick_positions, ['Low', 'High'])

##################################
# Tightening the layout and
# presenting the subplots
##################################
plt.tight_layout()
plt.show()

##################################
# Segregating the target
# and predictor variable names
##################################
y_variables = cancer_rate_preprocessed_categorical.columns
x_variable = 'CANRAT'

##################################
# Laying out a 2 x 2 grid of subplots
# and flattening the axes array so
# it can be indexed by position
##################################
num_rows, num_cols = 2, 2
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 10))
axes = axes.ravel()

##################################
# Drawing, for every categorical column,
# a stacked bar of its level proportions
# within each target class
##################################
for i, y_variable in enumerate(y_variables):
    ax = axes[i]
    # Rows: target classes; columns: levels of the current categorical column
    category_counts = (
        cancer_rate_preprocessed_categorical_combined
        .groupby([x_variable, y_variable], observed=True)
        .size()
        .unstack(fill_value=0)
    )
    # Each row is divided by its own total so the bars sum to 1
    class_totals = category_counts.sum(axis=1)
    category_proportions = category_counts.div(class_totals, axis=0)
    category_proportions.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{x_variable} Versus {y_variable}')
    ax.set(xlabel=x_variable, ylabel='Proportions')

##################################
# Tightening the layout and
# presenting the subplots
##################################
plt.tight_layout()
plt.show()

##################################
# Computing the t-test 
# statistic and p-values
# between the target variable
# and numeric predictor columns
##################################
cancer_rate_preprocessed_numeric_ttest_target = {}
cancer_rate_preprocessed_numeric = cancer_rate_preprocessed_combined
cancer_rate_preprocessed_numeric_columns = cancer_rate_preprocessed_predictors
# The two class subsets do not depend on the predictor being tested,
# so they are built once instead of once per column
group_0 = cancer_rate_preprocessed_numeric[cancer_rate_preprocessed_numeric.loc[:,'CANRAT']=='Low']
group_1 = cancer_rate_preprocessed_numeric[cancer_rate_preprocessed_numeric.loc[:,'CANRAT']=='High']
for numeric_column in cancer_rate_preprocessed_numeric_columns:
    # Two-sample t-test ('Low' versus 'High') assuming equal variances
    cancer_rate_preprocessed_numeric_ttest_target['CANRAT_' + numeric_column] = stats.ttest_ind(
        group_0[numeric_column], 
        group_1[numeric_column], 
        equal_var=True)

##################################
# Formulating the pairwise ttest summary
# between the target variable
# and numeric predictor columns
##################################
# from_dict is a DataFrame constructor, so it is called on the class
# rather than through an unrelated DataFrame instance
cancer_rate_preprocessed_numeric_summary = pd.DataFrame.from_dict(cancer_rate_preprocessed_numeric_ttest_target, orient='index')
cancer_rate_preprocessed_numeric_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(cancer_rate_preprocessed_numeric_summary.sort_values(by=['T.Test.PValue'], ascending=True).head(12))

##################################
# Computing the chisquare
# statistic and p-values
# between the target variable
# and categorical predictor columns
##################################
cancer_rate_preprocessed_categorical_chisquare_target = {}
# NOTE(review): this rebinds cancer_rate_preprocessed_categorical to the
# frame that also carries the CANRAT column
cancer_rate_preprocessed_categorical = cancer_rate_preprocessed_categorical_combined
cancer_rate_preprocessed_categorical_columns = ['HDICAT_L','HDICAT_M','HDICAT_H','HDICAT_VH']
for categorical_column in cancer_rate_preprocessed_categorical_columns:
    # Cross-tabulation of the indicator column against the target classes
    contingency_table = pd.crosstab(
        index=cancer_rate_preprocessed_categorical[categorical_column],
        columns=cancer_rate_preprocessed_categorical['CANRAT'])
    chisquare_result = stats.chi2_contingency(contingency_table)
    # Only the first two entries of the result (statistic, p-value) are kept
    cancer_rate_preprocessed_categorical_chisquare_target['CANRAT_' + categorical_column] = chisquare_result[0:2]

##################################
# Tabulating the chisquare results
# (one row per indicator column),
# ordered by increasing p-value
##################################
cancer_rate_preprocessed_categorical_summary = pd.DataFrame.from_dict(
    cancer_rate_preprocessed_categorical_chisquare_target,
    orient='index')
cancer_rate_preprocessed_categorical_summary.columns = ['ChiSquare.Test.Statistic', 'ChiSquare.Test.PValue']
cancer_rate_preprocessed_categorical_ranked = cancer_rate_preprocessed_categorical_summary.sort_values(
    by=['ChiSquare.Test.PValue'],
    ascending=True).head(4)
display(cancer_rate_preprocessed_categorical_ranked)

##################################
# Assigning all numeric columns
# as model predictors
##################################
# Plain assignment: both names refer to the same DataFrame object (no copy is made)
cancer_rate_premodelling = cancer_rate_preprocessed_combined
# Bare expression: its value (the column index) is what the notebook cell displays
cancer_rate_premodelling.columns

Index(['URBPOP', 'POPGRO', 'LIFEXP', 'TUBINC', 'DTHCMD', 'AGRLND', 'GHGEMI',
       'FORARE', 'CO2EMI', 'POPDEN', 'GDPCAP', 'EPISCO', 'CANRAT'],
      dtype='object')

##################################
# Performing a general exploration of the pre-modelling dataset
##################################
# Heading first, then the (rows, columns) tuple of the pre-modelling frame
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)

Dataset Dimensions:

(163, 13)

##################################
# Listing the column names and data types
##################################
# Heading first, then one dtype entry per column of the pre-modelling frame
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)

Column Names and Data Types:

URBPOP     float64
POPGRO     float64
LIFEXP     float64
TUBINC     float64
DTHCMD     float64
AGRLND     float64
GHGEMI     float64
FORARE     float64
CO2EMI     float64
POPDEN     float64
GDPCAP     float64
EPISCO     float64
CANRAT    category
dtype: object

##################################
# Taking a snapshot of the dataset
##################################
# Bare expression: the first rows of the frame (pandas default of 5) are shown as the cell output
cancer_rate_premodelling.head()

##################################
# Formulating a scatterplot matrix
# of all pairwise combinations of 
# numeric predictors labeled by
# categorical response classes
##################################
# Points are coloured by the CANRAT class of each observation
sns.pairplot(data=cancer_rate_premodelling, hue='CANRAT')
plt.show()

##################################
# Converting the dataframe to
# a numpy array
##################################
cancer_rate_premodelling_matrix = cancer_rate_premodelling.to_numpy()

##################################
# Preparing the data
# and converting it to a suitable format
# as a neural network model input
##################################
# Predictors: the first 12 columns of the frame as a NumPy matrix
matrix_x_values = cancer_rate_premodelling.iloc[:, :12].to_numpy()
# Target: 1 where CANRAT is 'High' and 0 otherwise, then expanded
# into one indicator column per class
is_high_cancer_rate = cancer_rate_premodelling['CANRAT'] == 'High'
y_values_series = np.where(is_high_cancer_rate, 1, 0)
y_values = pd.get_dummies(y_values_series).to_numpy()

##################################
# Defining the neural network architecture
##################################
# Number of input units; matches the 12 predictor columns placed in matrix_x_values
input_size = 12
# One entry per hidden layer: three hidden layers of 3 units each
hidden_sizes = [3, 3, 3]
# Number of output units, one per indicator column of y_values
output_size = 2

##################################
# Defining the training parameters
##################################
# Step size applied to the gradients in the parameter update functions
learning_rate = 0.01
# Number of training passes; 5001 makes iteration 5000 the last one
# reported by the every-100-iterations printout
iterations = 5001

##################################
# Initializing model weights and biases
##################################
def initialize_parameters(input_size, hidden_sizes, output_size):
    """Create the weight and bias arrays of a fully connected network.

    Parameters
    ----------
    input_size : number of input units.
    hidden_sizes : list with the number of units of each hidden layer.
    output_size : number of output units.

    Returns
    -------
    dict with, for each layer ``k`` starting at 1, ``'Wk'`` drawn from
    ``np.random.randn`` with shape (units of layer k-1, units of layer k)
    and ``'bk'`` filled with zeros with shape (1, units of layer k).
    """
    layer_sizes = [input_size] + hidden_sizes + [output_size]
    parameters = {}
    # Consecutive sizes are paired as (incoming units, outgoing units)
    size_pairs = zip(layer_sizes, layer_sizes[1:])
    for layer, (units_in, units_out) in enumerate(size_pairs, start=1):
        parameters[f'W{layer}'] = np.random.randn(units_in, units_out)
        parameters[f'b{layer}'] = np.zeros((1, units_out))
    return parameters

##################################
# Defining the activation function (ReLU)
##################################
def relu(x):
    """Return the element-wise maximum of ``x`` and zero."""
    lower_bound = 0
    return np.maximum(lower_bound, x)

##################################
# Defining the Softmax function
##################################
def softmax(x):
    """Return the row-wise softmax of the 2-D array ``x``.

    The maximum of each row is subtracted before exponentiating, which
    leaves the result unchanged while keeping the exponentials bounded.
    """
    row_maximum = np.max(x, axis=1, keepdims=True)
    shifted_exponentials = np.exp(x - row_maximum)
    row_total = np.sum(shifted_exponentials, axis=1, keepdims=True)
    return shifted_exponentials / row_total

##################################
# Defining the Forward propagation algorithm
##################################
def forward_propagation(X, parameters):
    """Run the network forward and return every intermediate value.

    Parameters
    ----------
    X : input matrix, stored in the cache as ``'A0'``.
    parameters : dict holding ``'Wk'`` and ``'bk'`` for each layer ``k``.

    Returns
    -------
    dict with ``'Zk'`` (the affine output) and ``'Ak'`` (the activation) of
    each layer: ReLU on every layer except the last, which uses softmax.
    """
    # The dict stores one weight and one bias entry per layer
    output_layer = len(parameters) // 2
    cache = {'A0': X}
    for layer in range(1, output_layer + 1):
        pre_activation = np.dot(cache[f'A{layer - 1}'], parameters[f'W{layer}']) + parameters[f'b{layer}']
        cache[f'Z{layer}'] = pre_activation
        if layer == output_layer:
            cache[f'A{layer}'] = softmax(pre_activation)
        else:
            cache[f'A{layer}'] = relu(pre_activation)
    return cache

##################################
# Defining the Backward propagation algorithm
##################################
def backward_propagation(X, Y, parameters, cache):
    """Compute the gradients of every weight and bias from a forward cache.

    Parameters
    ----------
    X : input matrix; only its number of rows is used, to average the gradients.
    Y : target matrix subtracted from the output activation.
    parameters : dict holding ``'Wk'`` and ``'bk'`` for each layer ``k``.
    cache : dict of ``'Ak'`` / ``'Zk'`` values returned by the forward pass.

    Returns
    -------
    dict with ``'dWk'`` and ``'dbk'`` for each layer, filled from the last
    layer down to the first.
    """
    scale = 1 / X.shape[0]
    layer_count = len(parameters) // 2
    gradients = {}
    # Error signal at the output layer: activation minus target
    delta = cache[f'A{layer_count}'] - Y
    for layer in reversed(range(1, layer_count + 1)):
        incoming_activation = cache[f'A{layer - 1}']
        gradients[f'dW{layer}'] = scale * np.dot(incoming_activation.T, delta)
        gradients[f'db{layer}'] = scale * np.sum(delta, axis=0, keepdims=True)
        if layer == 1:
            # Nothing is propagated below the first layer
            continue
        upstream_signal = np.dot(delta, parameters[f'W{layer}'].T)
        # The signal passes only through units whose pre-activation was positive
        delta = upstream_signal * (cache[f'Z{layer - 1}'] > 0)
    return gradients

##################################
# Defining the Cross-entropy loss
##################################
def compute_cost(Y, Y_hat):
    """Return the mean negative log of the probability given to the true class.

    Parameters
    ----------
    Y : target matrix; the true class of a row is the position of its maximum.
    Y_hat : matrix of predicted values with the same row order as ``Y``.
    """
    sample_count = Y.shape[0]
    true_class = np.argmax(Y, axis=1)
    # One predicted value per row is picked: the one in the true-class column
    true_class_probability = Y_hat[range(sample_count), true_class]
    return np.sum(-np.log(true_class_probability)) / sample_count

##################################
# Updating model parameters
# with no regularization
##################################
def update_parameters_no_reg(parameters, gradients, learning_rate):
    """Apply one plain gradient-descent step to every weight and bias.

    The arrays stored in ``parameters`` are modified in place and the same
    dict is returned.
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        for prefix in ('W', 'b'):
            parameters[f'{prefix}{layer}'] -= learning_rate * gradients[f'd{prefix}{layer}']
    return parameters

##################################
# Implementing neural network model training
# using no regularization
##################################

##################################
# Initializing training parameters
##################################
# A fixed seed makes the random initial weights repeatable
np.random.seed(88888)
parameters = initialize_parameters(input_size, hidden_sizes, output_size)

##################################
# Preparing the per-iteration records:
# cost, accuracy and one list of
# weight snapshots per layer
##################################
costs, accuracies = [], []
weight_history = {f'W{layer}': [] for layer in range(1, len(hidden_sizes) + 2)}

##################################
# Training a neural network model
# using no regularization
##################################
for i in range(iterations):
    # Forward pass; the activation of the last layer is the prediction
    cache = forward_propagation(matrix_x_values, parameters)
    output_layer = len(parameters) // 2
    Y_hat = cache[f'A{output_layer}']

    # Cost and accuracy are measured before this iteration's update
    cost = compute_cost(y_values, Y_hat)
    true_classes = np.argmax(y_values, axis=1)
    predicted_classes = np.argmax(Y_hat, axis=1)
    accuracy = np.mean(true_classes == predicted_classes)

    # Backward pass followed by the plain gradient-descent update
    gradients = backward_propagation(matrix_x_values, y_values, parameters, cache)
    parameters = update_parameters_no_reg(parameters, gradients, learning_rate)

    # A copy of every updated weight matrix is stored
    for weight_name, snapshots in weight_history.items():
        snapshots.append(parameters[weight_name].copy())

    costs.append(cost)
    accuracies.append(accuracy)

    # Progress report every 100 iterations
    if i % 100 == 0:
        print(f"Iteration {i}: Cost = {cost}, Accuracy = {accuracy}")

Iteration 0: Cost = 0.6655730513372478, Accuracy = 0.7484662576687117
Iteration 100: Cost = 0.44337999241499515, Accuracy = 0.7484662576687117
Iteration 200: Cost = 0.42965273053345476, Accuracy = 0.7484662576687117
Iteration 300: Cost = 0.4207049714278549, Accuracy = 0.7484662576687117
Iteration 400: Cost = 0.41306128303874956, Accuracy = 0.7484662576687117
Iteration 500: Cost = 0.4061728709319091, Accuracy = 0.7484662576687117
Iteration 600: Cost = 0.3999572426027226, Accuracy = 0.7484662576687117
Iteration 700: Cost = 0.3945876944310266, Accuracy = 0.7484662576687117
Iteration 800: Cost = 0.3894959618753931, Accuracy = 0.7484662576687117
Iteration 900: Cost = 0.3849751734735514, Accuracy = 0.7484662576687117
Iteration 1000: Cost = 0.38094362023452166, Accuracy = 0.7975460122699386
Iteration 1100: Cost = 0.3772707519990685, Accuracy = 0.7852760736196319
Iteration 1200: Cost = 0.37387392111905915, Accuracy = 0.7914110429447853
Iteration 1300: Cost = 0.37065030127004317, Accuracy = 0.7791411042944786
Iteration 1400: Cost = 0.3675696331356671, Accuracy = 0.7852760736196319
Iteration 1500: Cost = 0.3645840417022261, Accuracy = 0.803680981595092
Iteration 1600: Cost = 0.36127662530066484, Accuracy = 0.8466257668711656
Iteration 1700: Cost = 0.3571343604160831, Accuracy = 0.852760736196319
Iteration 1800: Cost = 0.3526980236572597, Accuracy = 0.8711656441717791
Iteration 1900: Cost = 0.3484630363540637, Accuracy = 0.8711656441717791
Iteration 2000: Cost = 0.34369176959870185, Accuracy = 0.8711656441717791
Iteration 2100: Cost = 0.33842245810259586, Accuracy = 0.8711656441717791
Iteration 2200: Cost = 0.33296683623267265, Accuracy = 0.8711656441717791
Iteration 2300: Cost = 0.32707243289285487, Accuracy = 0.8773006134969326
Iteration 2400: Cost = 0.3202156712118621, Accuracy = 0.8711656441717791
Iteration 2500: Cost = 0.31345579220835984, Accuracy = 0.8711656441717791
Iteration 2600: Cost = 0.30695384022815125, Accuracy = 0.8711656441717791
Iteration 2700: Cost = 0.3008522146520388, Accuracy = 0.8834355828220859
Iteration 2800: Cost = 0.2950480748849789, Accuracy = 0.8773006134969326
Iteration 2900: Cost = 0.28949665658448714, Accuracy = 0.8834355828220859
Iteration 3000: Cost = 0.2834331127774422, Accuracy = 0.8834355828220859
Iteration 3100: Cost = 0.2773402258747664, Accuracy = 0.8895705521472392
Iteration 3200: Cost = 0.27144698021478597, Accuracy = 0.8895705521472392
Iteration 3300: Cost = 0.2656990479338621, Accuracy = 0.8895705521472392
Iteration 3400: Cost = 0.2601293450956818, Accuracy = 0.8895705521472392
Iteration 3500: Cost = 0.2551955402907495, Accuracy = 0.8895705521472392
Iteration 3600: Cost = 0.2504756950788247, Accuracy = 0.8895705521472392
Iteration 3700: Cost = 0.24553061404690496, Accuracy = 0.8957055214723927
Iteration 3800: Cost = 0.24082289403705706, Accuracy = 0.8957055214723927
Iteration 3900: Cost = 0.23648196511212316, Accuracy = 0.8957055214723927
Iteration 4000: Cost = 0.23255818058559383, Accuracy = 0.8957055214723927
Iteration 4100: Cost = 0.2290045195524835, Accuracy = 0.901840490797546
Iteration 4200: Cost = 0.22554933520926515, Accuracy = 0.901840490797546
Iteration 4300: Cost = 0.22237283146422782, Accuracy = 0.901840490797546
Iteration 4400: Cost = 0.21956579476570934, Accuracy = 0.901840490797546
Iteration 4500: Cost = 0.2168706344585124, Accuracy = 0.9079754601226994
Iteration 4600: Cost = 0.21428226149736565, Accuracy = 0.9079754601226994
Iteration 4700: Cost = 0.21185730224835264, Accuracy = 0.9141104294478528
Iteration 4800: Cost = 0.2095142635001247, Accuracy = 0.9202453987730062
Iteration 4900: Cost = 0.20723912932558353, Accuracy = 0.9202453987730062
Iteration 5000: Cost = 0.20503671541111923, Accuracy = 0.9202453987730062

##################################
# Plotting cost and accuracy profiles
##################################
plt.figure(figsize=(15, 4.75))
# (subplot position, recorded values, y-axis label, title) of each panel
profile_panels = [
    (1, costs, 'Cost', 'No Regularization: Cost Function by Iteration'),
    (2, accuracies, 'Accuracy', 'No Regularization: Classification by Iteration'),
]
for panel_position, profile_values, axis_label, panel_title in profile_panels:
    plt.subplot(1, 2, panel_position)
    plt.plot(range(iterations), profile_values)
    plt.ylim([0,1])
    plt.xlabel('Iterations')
    plt.ylabel(axis_label)
    plt.title(panel_title)

plt.show()

##################################
# Plotting model weight profiles
##################################
num_layers = len(hidden_sizes) + 1
# Two rows of panels; the column count is rounded up so the grid still has
# enough cells when the number of layers is odd (or equal to 1)
num_panel_columns = (num_layers + 1) // 2
plt.figure(figsize=(15, 10))
for i in range(1, num_layers + 1):
    # Absolute values of every stored snapshot of this layer, computed once
    absolute_weight_history = [np.abs(weights) for weights in weight_history[f'W{i}']]
    plt.subplot(2, num_panel_columns, i)
    plt.plot(range(iterations), [np.max(weights) for weights in absolute_weight_history], label="Maximum Weight")
    plt.plot(range(iterations), [np.mean(weights) for weights in absolute_weight_history], label="Mean Weight")
    plt.plot(range(iterations), [np.min(weights) for weights in absolute_weight_history], label="Minimum Weight")
    plt.ylim([-1,5])
    plt.legend(loc="upper right")
    plt.xlabel('Iterations')
    plt.ylabel('Absolute Weight')
    plt.title(f'No Regularization: Layer {i} Weights by Iteration')

plt.show()

##################################
# Gathering the final accuracy, cost 
# and mean layer weight values for 
# No Regularization
##################################
NR_metrics = pd.DataFrame(["ACCURACY","LOSS","LAYER 1 MEAN WEIGHT","LAYER 2 MEAN WEIGHT","LAYER 3 MEAN WEIGHT","LAYER 4 MEAN WEIGHT"])
# Only the last stored snapshot of each layer is summarised, instead of
# averaging every snapshot and discarding all but the final value
NR_values = pd.DataFrame([accuracies[-1],
                          costs[-1],
                          np.mean(np.abs(weight_history['W1'][-1])),
                          np.mean(np.abs(weight_history['W2'][-1])),
                          np.mean(np.abs(weight_history['W3'][-1])),
                          np.mean(np.abs(weight_history['W4'][-1]))])
NR_method = pd.DataFrame(["No Regularization"]*6)
# The three single-column frames are placed side by side, one row per metric
NR_summary = pd.concat([NR_metrics, 
                        NR_values,
                        NR_method], axis=1)
NR_summary.columns = ['Metric', 'Value', 'Method']
NR_summary.reset_index(inplace=True, drop=True)
display(NR_summary)

##################################
# Updating model parameters
# with L1 regularization
##################################
def update_parameters_l1(parameters, gradients, learning_rate, lambd):
    """Apply one gradient-descent step with an L1 penalty on the weights.

    ``lambd * sign(W)`` is added to each weight gradient; the biases are
    updated without any penalty. The arrays stored in ``parameters`` are
    modified in place and the same dict is returned.
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        weight_key, bias_key = f'W{layer}', f'b{layer}'
        # The penalty term is evaluated on the weights before they are updated
        l1_penalty_gradient = lambd * np.sign(parameters[weight_key])
        parameters[weight_key] -= learning_rate * (gradients[f'dW{layer}'] + l1_penalty_gradient)
        parameters[bias_key] -= learning_rate * gradients[f'db{layer}']
    return parameters

##################################
# Implementing neural network model training
# using L1 regularization
##################################

##################################
# Initializing training parameters
##################################
# A fixed seed makes the random initial weights repeatable
np.random.seed(88888)
parameters = initialize_parameters(input_size, hidden_sizes, output_size)

##################################
# Setting the strength of the L1 penalty
##################################
lambd = 0.01

##################################
# Preparing the per-iteration records:
# cost, accuracy and one list of
# weight snapshots per layer
##################################
costs, accuracies = [], []
weight_history = {f'W{layer}': [] for layer in range(1, len(hidden_sizes) + 2)}

##################################
# Training a neural network model
# using L1 regularization
##################################
for i in range(iterations):
    # Forward pass; the activation of the last layer is the prediction
    cache = forward_propagation(matrix_x_values, parameters)
    output_layer = len(parameters) // 2
    Y_hat = cache[f'A{output_layer}']

    # Cost and accuracy are measured before this iteration's update
    cost = compute_cost(y_values, Y_hat)
    true_classes = np.argmax(y_values, axis=1)
    predicted_classes = np.argmax(Y_hat, axis=1)
    accuracy = np.mean(true_classes == predicted_classes)

    # Backward pass followed by the L1-penalised update
    gradients = backward_propagation(matrix_x_values, y_values, parameters, cache)
    parameters = update_parameters_l1(parameters, gradients, learning_rate, lambd)

    # A copy of every updated weight matrix is stored
    for weight_name, snapshots in weight_history.items():
        snapshots.append(parameters[weight_name].copy())

    costs.append(cost)
    accuracies.append(accuracy)

    # Progress report every 100 iterations
    if i % 100 == 0:
        print(f"Iteration {i}: Cost = {cost}, Accuracy = {accuracy}")

Iteration 0: Cost = 0.6655730513372478, Accuracy = 0.7484662576687117
Iteration 100: Cost = 0.4462035807404296, Accuracy = 0.7484662576687117
Iteration 200: Cost = 0.43307112491484734, Accuracy = 0.7484662576687117
Iteration 300: Cost = 0.4245753603648683, Accuracy = 0.7484662576687117
Iteration 400: Cost = 0.41765452160991584, Accuracy = 0.7484662576687117
Iteration 500: Cost = 0.41170725036511086, Accuracy = 0.7484662576687117
Iteration 600: Cost = 0.4063499607644956, Accuracy = 0.7484662576687117
Iteration 700: Cost = 0.40141996514044564, Accuracy = 0.7484662576687117
Iteration 800: Cost = 0.39695705094036426, Accuracy = 0.7484662576687117
Iteration 900: Cost = 0.3928552967688034, Accuracy = 0.7484662576687117
Iteration 1000: Cost = 0.3891339343868409, Accuracy = 0.7484662576687117
Iteration 1100: Cost = 0.3857114804338176, Accuracy = 0.7484662576687117
Iteration 1200: Cost = 0.3825210047372746, Accuracy = 0.7914110429447853
Iteration 1300: Cost = 0.37937159620923677, Accuracy = 0.803680981595092
Iteration 1400: Cost = 0.3764264838052875, Accuracy = 0.803680981595092
Iteration 1500: Cost = 0.37341353374383973, Accuracy = 0.803680981595092
Iteration 1600: Cost = 0.3704330890482548, Accuracy = 0.8098159509202454
Iteration 1700: Cost = 0.3677665432448274, Accuracy = 0.8098159509202454
Iteration 1800: Cost = 0.36536349322864836, Accuracy = 0.8159509202453987
Iteration 1900: Cost = 0.36310828716407484, Accuracy = 0.803680981595092
Iteration 2000: Cost = 0.36095044411591176, Accuracy = 0.8159509202453987
Iteration 2100: Cost = 0.35886339245233684, Accuracy = 0.8466257668711656
Iteration 2200: Cost = 0.35669447205475463, Accuracy = 0.8588957055214724
Iteration 2300: Cost = 0.35434982841820356, Accuracy = 0.8650306748466258
Iteration 2400: Cost = 0.3519215411585186, Accuracy = 0.8773006134969326
Iteration 2500: Cost = 0.3495143992788678, Accuracy = 0.8834355828220859
Iteration 2600: Cost = 0.3470638445163202, Accuracy = 0.8834355828220859
Iteration 2700: Cost = 0.3440257857333663, Accuracy = 0.8834355828220859
Iteration 2800: Cost = 0.34045465628368254, Accuracy = 0.8895705521472392
Iteration 2900: Cost = 0.3363334032414547, Accuracy = 0.8895705521472392
Iteration 3000: Cost = 0.33174441710196806, Accuracy = 0.8895705521472392
Iteration 3100: Cost = 0.32646060009702393, Accuracy = 0.8957055214723927
Iteration 3200: Cost = 0.32077767802742907, Accuracy = 0.901840490797546
Iteration 3300: Cost = 0.31478412458414284, Accuracy = 0.901840490797546
Iteration 3400: Cost = 0.30786859885321166, Accuracy = 0.901840490797546
Iteration 3500: Cost = 0.30064014725686056, Accuracy = 0.901840490797546
Iteration 3600: Cost = 0.29328994308362044, Accuracy = 0.9079754601226994
Iteration 3700: Cost = 0.2858443391266973, Accuracy = 0.9079754601226994
Iteration 3800: Cost = 0.2782177297063182, Accuracy = 0.9079754601226994
Iteration 3900: Cost = 0.26988386077770943, Accuracy = 0.9141104294478528
Iteration 4000: Cost = 0.26115813096813184, Accuracy = 0.9141104294478528
Iteration 4100: Cost = 0.25289715539151114, Accuracy = 0.9141104294478528
Iteration 4200: Cost = 0.2451885812355063, Accuracy = 0.9079754601226994
Iteration 4300: Cost = 0.23709745606369595, Accuracy = 0.9079754601226994
Iteration 4400: Cost = 0.22830136180962884, Accuracy = 0.9079754601226994
Iteration 4500: Cost = 0.21929749566121318, Accuracy = 0.9079754601226994
Iteration 4600: Cost = 0.21094748273654212, Accuracy = 0.9141104294478528
Iteration 4700: Cost = 0.20291314146180195, Accuracy = 0.9202453987730062
Iteration 4800: Cost = 0.1948127609319005, Accuracy = 0.9263803680981595
Iteration 4900: Cost = 0.18713571925816344, Accuracy = 0.9386503067484663
Iteration 5000: Cost = 0.17995911258805278, Accuracy = 0.9447852760736196

##################################
# Plotting cost and accuracy profiles
##################################
plt.figure(figsize=(15, 4.75))
# (subplot position, recorded values, y-axis label, title) of each panel
profile_panels = [
    (1, costs, 'Cost', 'L1 Regularization: Cost Function by Iteration'),
    (2, accuracies, 'Accuracy', 'L1 Regularization: Classification by Iteration'),
]
for panel_position, profile_values, axis_label, panel_title in profile_panels:
    plt.subplot(1, 2, panel_position)
    plt.plot(range(iterations), profile_values)
    plt.ylim([0,1])
    plt.xlabel('Iterations')
    plt.ylabel(axis_label)
    plt.title(panel_title)

plt.show()

##################################
# Plotting model weight profiles
##################################
num_layers = len(hidden_sizes) + 1
# Two rows of panels; the column count is rounded up so the grid still has
# enough cells when the number of layers is odd (or equal to 1)
num_panel_columns = (num_layers + 1) // 2
plt.figure(figsize=(15, 10))
for i in range(1, num_layers + 1):
    # Absolute values of every stored snapshot of this layer, computed once
    absolute_weight_history = [np.abs(weights) for weights in weight_history[f'W{i}']]
    plt.subplot(2, num_panel_columns, i)
    plt.plot(range(iterations), [np.max(weights) for weights in absolute_weight_history], label="Maximum Weight")
    plt.plot(range(iterations), [np.mean(weights) for weights in absolute_weight_history], label="Mean Weight")
    plt.plot(range(iterations), [np.min(weights) for weights in absolute_weight_history], label="Minimum Weight")
    plt.ylim([-1,5])
    plt.legend(loc="upper right")
    plt.xlabel('Iterations')
    plt.ylabel('Absolute Weight')
    plt.title(f'L1 Regularization: Layer {i} Weights by Iteration')

plt.show()

##################################
# Gathering the final accuracy, cost 
# and mean layer weight values for 
# L1 Regularization
##################################
L1R_metrics = pd.DataFrame(["ACCURACY","LOSS","LAYER 1 MEAN WEIGHT","LAYER 2 MEAN WEIGHT","LAYER 3 MEAN WEIGHT","LAYER 4 MEAN WEIGHT"])
# Only the last stored snapshot of each layer is summarised, instead of
# averaging every snapshot and discarding all but the final value
L1R_values = pd.DataFrame([accuracies[-1],
                          costs[-1],
                          np.mean(np.abs(weight_history['W1'][-1])),
                          np.mean(np.abs(weight_history['W2'][-1])),
                          np.mean(np.abs(weight_history['W3'][-1])),
                          np.mean(np.abs(weight_history['W4'][-1]))])
L1R_method = pd.DataFrame(["L1 Regularization"]*6)
# The three single-column frames are placed side by side, one row per metric
L1R_summary = pd.concat([L1R_metrics, 
                         L1R_values,
                         L1R_method], axis=1)
L1R_summary.columns = ['Metric', 'Value', 'Method']
L1R_summary.reset_index(inplace=True, drop=True)
display(L1R_summary)

##################################
# Updating model parameters
# with L2 regularization
##################################
def update_parameters_l2(parameters, gradients, learning_rate, lambd):
    """Apply one gradient step with an L2 weight penalty.

    Every weight entry ``W{k}`` is moved against ``dW{k} + lambd * W{k}``,
    while every bias entry ``b{k}`` is moved against ``db{k}`` only (biases
    are not penalized). The entries of ``parameters`` are updated in place
    and the same dictionary is returned.

    The number of layers is taken as half the number of entries in
    ``parameters`` (one ``W``/``b`` pair per layer).
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        weight_key, bias_key = f'W{layer}', f'b{layer}'
        # Data gradient plus the weight-proportional penalty term
        penalized_gradient = gradients[f'dW{layer}'] + lambd * parameters[weight_key]
        parameters[weight_key] -= learning_rate * penalized_gradient
        parameters[bias_key] -= learning_rate * gradients[f'db{layer}']
    return parameters

##################################
# Implementing neural network model training
# using L2 regularization
##################################

##################################
# Initializing training parameters
##################################
# Seeding before initialization (presumably initialize_parameters
# draws from numpy's global random state - confirm)
np.random.seed(88888)
parameters = initialize_parameters(input_size, hidden_sizes, output_size)

##################################
# Initializing regularization parameters
##################################
# Penalty strength handed to the L2 update rule
lambd = 0.01

##################################
# Creating lists to store cost and accuracy for plotting
##################################
costs, accuracies = [], []

##################################
# Creating a dictionary of per-layer
# weight snapshots for plotting
##################################
weight_keys = [f'W{layer}' for layer in range(1, len(hidden_sizes) + 2)]
weight_history = {key: [] for key in weight_keys}

##################################
# Training a neural network model
# using L2 regularization
##################################
# The output activation is stored under the last layer's key;
# the parameter dictionary holds one W/b pair per layer
output_key = f'A{len(parameters) // 2}'
for step in range(iterations):
    # Forward pass on the current parameters
    cache = forward_propagation(matrix_x_values, parameters)
    Y_hat = cache[output_key]

    # Cost and accuracy are measured before this step's update
    cost = compute_cost(y_values, Y_hat)
    accuracy = np.mean(np.argmax(y_values, axis=1) == np.argmax(Y_hat, axis=1))
    costs.append(cost)
    accuracies.append(accuracy)

    # Backward pass followed by the L2-penalized parameter update
    gradients = backward_propagation(matrix_x_values, y_values, parameters, cache)
    parameters = update_parameters_l2(parameters, gradients, learning_rate, lambd)

    # Snapshotting the updated weights; copies are stored because
    # the update modifies the weight entries in place
    for key in weight_keys:
        weight_history[key].append(parameters[key].copy())

    # Reporting progress every 100 iterations
    if step % 100 == 0:
        print(f"Iteration {step}: Cost = {cost}, Accuracy = {accuracy}")

Iteration 0: Cost = 0.6655730513372478, Accuracy = 0.7484662576687117
Iteration 100: Cost = 0.44429964465388627, Accuracy = 0.7484662576687117
Iteration 200: Cost = 0.43045877070958505, Accuracy = 0.7484662576687117
Iteration 300: Cost = 0.42162536388128696, Accuracy = 0.7484662576687117
Iteration 400: Cost = 0.4142955687599988, Accuracy = 0.7484662576687117
Iteration 500: Cost = 0.4078085106663453, Accuracy = 0.7484662576687117
Iteration 600: Cost = 0.40197085423396267, Accuracy = 0.7484662576687117
Iteration 700: Cost = 0.39684543102160064, Accuracy = 0.7484662576687117
Iteration 800: Cost = 0.39203711166365196, Accuracy = 0.7484662576687117
Iteration 900: Cost = 0.38778015593107573, Accuracy = 0.7484662576687117
Iteration 1000: Cost = 0.383955988136414, Accuracy = 0.7484662576687117
Iteration 1100: Cost = 0.38044959963623676, Accuracy = 0.7975460122699386
Iteration 1200: Cost = 0.3771724998467772, Accuracy = 0.7914110429447853
Iteration 1300: Cost = 0.37407878902293, Accuracy = 0.7852760736196319
Iteration 1400: Cost = 0.37115549852467994, Accuracy = 0.7975460122699386
Iteration 1500: Cost = 0.3683436835754613, Accuracy = 0.7791411042944786
Iteration 1600: Cost = 0.36550336906190745, Accuracy = 0.7791411042944786
Iteration 1700: Cost = 0.36275189336896685, Accuracy = 0.7852760736196319
Iteration 1800: Cost = 0.35973982742097, Accuracy = 0.8098159509202454
Iteration 1900: Cost = 0.3566011957815566, Accuracy = 0.8404907975460123
Iteration 2000: Cost = 0.352887209414023, Accuracy = 0.852760736196319
Iteration 2100: Cost = 0.3484851328160455, Accuracy = 0.8711656441717791
Iteration 2200: Cost = 0.34329588707424163, Accuracy = 0.8711656441717791
Iteration 2300: Cost = 0.33760321099236623, Accuracy = 0.8711656441717791
Iteration 2400: Cost = 0.3318094925708932, Accuracy = 0.8834355828220859
Iteration 2500: Cost = 0.32616023767700586, Accuracy = 0.8834355828220859
Iteration 2600: Cost = 0.32040146696162614, Accuracy = 0.8834355828220859
Iteration 2700: Cost = 0.3144087304921743, Accuracy = 0.8895705521472392
Iteration 2800: Cost = 0.30820588510561353, Accuracy = 0.8895705521472392
Iteration 2900: Cost = 0.3015916928170157, Accuracy = 0.8895705521472392
Iteration 3000: Cost = 0.29446004882991256, Accuracy = 0.901840490797546
Iteration 3100: Cost = 0.2875654441600376, Accuracy = 0.901840490797546
Iteration 3200: Cost = 0.2807765490681191, Accuracy = 0.901840490797546
Iteration 3300: Cost = 0.27309660138163006, Accuracy = 0.901840490797546
Iteration 3400: Cost = 0.2645481064228508, Accuracy = 0.9079754601226994
Iteration 3500: Cost = 0.25538936654757927, Accuracy = 0.9141104294478528
Iteration 3600: Cost = 0.24706959785215937, Accuracy = 0.9141104294478528
Iteration 3700: Cost = 0.23946139205738443, Accuracy = 0.9079754601226994
Iteration 3800: Cost = 0.23268259552029696, Accuracy = 0.9079754601226994
Iteration 3900: Cost = 0.22675946408653966, Accuracy = 0.9079754601226994
Iteration 4000: Cost = 0.22160929364688506, Accuracy = 0.9141104294478528
Iteration 4100: Cost = 0.21678888835861884, Accuracy = 0.9141104294478528
Iteration 4200: Cost = 0.21222808750927571, Accuracy = 0.9141104294478528
Iteration 4300: Cost = 0.2079227888760176, Accuracy = 0.9141104294478528
Iteration 4400: Cost = 0.20393108319997055, Accuracy = 0.9202453987730062
Iteration 4500: Cost = 0.20016016168692866, Accuracy = 0.9263803680981595
Iteration 4600: Cost = 0.19424766743460561, Accuracy = 0.9202453987730062
Iteration 4700: Cost = 0.18832681151391784, Accuracy = 0.9325153374233128
Iteration 4800: Cost = 0.18382694450847586, Accuracy = 0.9325153374233128
Iteration 4900: Cost = 0.179086904697276, Accuracy = 0.9447852760736196
Iteration 5000: Cost = 0.17509776853693604, Accuracy = 0.9447852760736196

##################################
# Plotting cost and accuracy profiles
##################################
plt.figure(figsize=(15, 4.75))

# Each entry describes one panel: the series to draw, its
# y-axis label and its title
profile_panels = [
    (costs, 'Cost', 'L2 Regularization: Cost Function by Iteration'),
    (accuracies, 'Accuracy', 'L2 Regularization: Classification by Iteration'),
]
for panel_position, (profile, y_label, panel_title) in enumerate(profile_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.plot(range(iterations), profile)
    # Both profiles share the same 0-1 vertical range
    plt.ylim([0,1])
    plt.xlabel('Iterations')
    plt.ylabel(y_label)
    plt.title(panel_title)

plt.show()

##################################
# Plotting model weight profiles
##################################
num_layers = len(hidden_sizes) + 1
# Rounding the column count up so every layer gets a slot in the
# two-row grid; 2 x (num_layers // 2) is too small whenever the
# number of layers is odd (and is an empty grid for a single layer)
num_columns = (num_layers + 1) // 2
plt.figure(figsize=(15, 10))
for i in range(1, num_layers + 1):
    # One panel per layer showing the maximum, mean and minimum
    # absolute weight recorded at each iteration
    plt.subplot(2, num_columns, i)
    plt.plot(range(iterations), [np.max(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Maximum Weight")
    plt.plot(range(iterations), [np.mean(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Mean Weight")
    plt.plot(range(iterations), [np.min(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Minimum Weight")
    # Fixed y-range so the layer panels are directly comparable
    plt.ylim([-1,5])
    plt.legend(loc="upper right")
    plt.xlabel('Iterations')
    plt.ylabel('Absolute Weight')
    plt.title(f'L2 Regularization: Layer {i} Weights by Iteration')

plt.show()

##################################
# Gathering the final accuracy, cost 
# and mean layer weight values for 
# L2 Regularization
##################################
# Only the last recorded snapshot of each layer feeds the summary,
# so the mean absolute weight is computed on that snapshot alone
# instead of re-averaging the entire weight history
L2R_final_mean_weights = [np.mean(np.abs(weight_history[f'W{layer}'][-1])) for layer in range(1, 5)]
L2R_metrics = pd.DataFrame(["ACCURACY","LOSS"] + [f"LAYER {layer} MEAN WEIGHT" for layer in range(1, 5)])
L2R_values = pd.DataFrame([accuracies[-1], costs[-1]] + L2R_final_mean_weights)
# One method label per metric row
L2R_method = pd.DataFrame(["L2 Regularization"] * len(L2R_metrics))
# Placing the metric names, values and method label side by side
L2R_summary = pd.concat([L2R_metrics, 
                         L2R_values,
                         L2R_method], axis=1)
L2R_summary.columns = ['Metric', 'Value', 'Method']
L2R_summary.reset_index(inplace=True, drop=True)
display(L2R_summary)

##################################
# Updating model parameters
# with ElasticNet regularization
##################################
def update_parameters_elastic(parameters, gradients, learning_rate, lambd):
    """Apply one gradient step with a combined L2 and L1 weight penalty.

    Every weight entry ``W{k}`` is moved against
    ``dW{k} + lambd * W{k} + lambd * sign(W{k})``; the same ``lambd`` scales
    both penalty terms. Every bias entry ``b{k}`` is moved against ``db{k}``
    only. The entries of ``parameters`` are updated in place and the same
    dictionary is returned.

    The number of layers is taken as half the number of entries in
    ``parameters`` (one ``W``/``b`` pair per layer).
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        weight_key, bias_key = f'W{layer}', f'b{layer}'
        # Both penalty terms are evaluated on the weights as they
        # stand before this step's update
        l2_term = lambd * parameters[weight_key]
        l1_term = lambd * np.sign(parameters[weight_key])
        penalized_gradient = gradients[f'dW{layer}'] + l2_term + l1_term
        parameters[weight_key] -= learning_rate * penalized_gradient
        parameters[bias_key] -= learning_rate * gradients[f'db{layer}']
    return parameters

##################################
# Implementing neural network model training
# with ElasticNet regularization
##################################

##################################
# Initializing training parameters
##################################
# Seeding before initialization (presumably initialize_parameters
# draws from numpy's global random state - confirm)
np.random.seed(88888)
parameters = initialize_parameters(input_size, hidden_sizes, output_size)

##################################
# Initializing regularization parameters
##################################
# Penalty strength handed to the ElasticNet update rule
lambd = 0.01

##################################
# Creating lists to store cost and accuracy for plotting
##################################
costs, accuracies = [], []

##################################
# Creating a dictionary of per-layer
# weight snapshots for plotting
##################################
weight_keys = [f'W{layer}' for layer in range(1, len(hidden_sizes) + 2)]
weight_history = {key: [] for key in weight_keys}

##################################
# Training a neural network model
# using ElasticNet regularization
##################################
# The output activation is stored under the last layer's key;
# the parameter dictionary holds one W/b pair per layer
output_key = f'A{len(parameters) // 2}'
for step in range(iterations):
    # Forward pass on the current parameters
    cache = forward_propagation(matrix_x_values, parameters)
    Y_hat = cache[output_key]

    # Cost and accuracy are measured before this step's update
    cost = compute_cost(y_values, Y_hat)
    accuracy = np.mean(np.argmax(y_values, axis=1) == np.argmax(Y_hat, axis=1))
    costs.append(cost)
    accuracies.append(accuracy)

    # Backward pass followed by the ElasticNet-penalized parameter update
    gradients = backward_propagation(matrix_x_values, y_values, parameters, cache)
    parameters = update_parameters_elastic(parameters, gradients, learning_rate, lambd)

    # Snapshotting the updated weights; copies are stored because
    # the update modifies the weight entries in place
    for key in weight_keys:
        weight_history[key].append(parameters[key].copy())

    # Reporting progress every 100 iterations
    if step % 100 == 0:
        print(f"Iteration {step}: Cost = {cost}, Accuracy = {accuracy}")

Iteration 0: Cost = 0.6655730513372478, Accuracy = 0.7484662576687117
Iteration 100: Cost = 0.4471882556379198, Accuracy = 0.7484662576687117
Iteration 200: Cost = 0.4340059474063969, Accuracy = 0.7484662576687117
Iteration 300: Cost = 0.42572286715691854, Accuracy = 0.7484662576687117
Iteration 400: Cost = 0.41905169319864527, Accuracy = 0.7484662576687117
Iteration 500: Cost = 0.4134013298265906, Accuracy = 0.7484662576687117
Iteration 600: Cost = 0.4083585637233199, Accuracy = 0.7484662576687117
Iteration 700: Cost = 0.4037088551833828, Accuracy = 0.7484662576687117
Iteration 800: Cost = 0.39966601460497914, Accuracy = 0.7484662576687117
Iteration 900: Cost = 0.39594057562593066, Accuracy = 0.7484662576687117
Iteration 1000: Cost = 0.3925139486505437, Accuracy = 0.7484662576687117
Iteration 1100: Cost = 0.38909789119889376, Accuracy = 0.7484662576687117
Iteration 1200: Cost = 0.38596483796119213, Accuracy = 0.7484662576687117
Iteration 1300: Cost = 0.3829541535910757, Accuracy = 0.7791411042944786
Iteration 1400: Cost = 0.3800650037719093, Accuracy = 0.7791411042944786
Iteration 1500: Cost = 0.37742536144848615, Accuracy = 0.7791411042944786
Iteration 1600: Cost = 0.37489632581197785, Accuracy = 0.803680981595092
Iteration 1700: Cost = 0.37241872084936734, Accuracy = 0.803680981595092
Iteration 1800: Cost = 0.36997305683060216, Accuracy = 0.8098159509202454
Iteration 1900: Cost = 0.3675035676569522, Accuracy = 0.8159509202453987
Iteration 2000: Cost = 0.36482829140578205, Accuracy = 0.7975460122699386
Iteration 2100: Cost = 0.3620304440423624, Accuracy = 0.8159509202453987
Iteration 2200: Cost = 0.3591947322612712, Accuracy = 0.8588957055214724
Iteration 2300: Cost = 0.3563843202170577, Accuracy = 0.8711656441717791
Iteration 2400: Cost = 0.3532542428735755, Accuracy = 0.8834355828220859
Iteration 2500: Cost = 0.35006581551657995, Accuracy = 0.8588957055214724
Iteration 2600: Cost = 0.34637500948900574, Accuracy = 0.8650306748466258
Iteration 2700: Cost = 0.3423694185923936, Accuracy = 0.8650306748466258
Iteration 2800: Cost = 0.3383693028198383, Accuracy = 0.8650306748466258
Iteration 2900: Cost = 0.3334202257534935, Accuracy = 0.8650306748466258
Iteration 3000: Cost = 0.32143961547873967, Accuracy = 0.8711656441717791
Iteration 3100: Cost = 0.3091981114034324, Accuracy = 0.8773006134969326
Iteration 3200: Cost = 0.2980401709962735, Accuracy = 0.8773006134969326
Iteration 3300: Cost = 0.28714421049404465, Accuracy = 0.8773006134969326
Iteration 3400: Cost = 0.27621050170812617, Accuracy = 0.8773006134969326
Iteration 3500: Cost = 0.26495607009773003, Accuracy = 0.8834355828220859
Iteration 3600: Cost = 0.2533825501636212, Accuracy = 0.9263803680981595
Iteration 3700: Cost = 0.24204667334651384, Accuracy = 0.9263803680981595
Iteration 3800: Cost = 0.2302050916179891, Accuracy = 0.9263803680981595
Iteration 3900: Cost = 0.21841093625348015, Accuracy = 0.9325153374233128
Iteration 4000: Cost = 0.20712519653582573, Accuracy = 0.9447852760736196
Iteration 4100: Cost = 0.19557938691044466, Accuracy = 0.950920245398773
Iteration 4200: Cost = 0.18458061334275247, Accuracy = 0.950920245398773
Iteration 4300: Cost = 0.17151486813077457, Accuracy = 0.9570552147239264
Iteration 4400: Cost = 0.15857087106473372, Accuracy = 0.9693251533742331
Iteration 4500: Cost = 0.1480931620421723, Accuracy = 0.9693251533742331
Iteration 4600: Cost = 0.13931546350255852, Accuracy = 0.9693251533742331
Iteration 4700: Cost = 0.1304569791815692, Accuracy = 0.9693251533742331
Iteration 4800: Cost = 0.12228786239597626, Accuracy = 0.9754601226993865
Iteration 4900: Cost = 0.11616751236073691, Accuracy = 0.9693251533742331
Iteration 5000: Cost = 0.109621599701186, Accuracy = 0.9693251533742331

##################################
# Plotting cost and accuracy profiles
##################################
plt.figure(figsize=(15, 4.75))

# Each entry describes one panel: the series to draw, its
# y-axis label and its title
profile_panels = [
    (costs, 'Cost', 'ElasticNet Regularization: Cost Function by Iteration'),
    (accuracies, 'Accuracy', 'ElasticNet Regularization: Classification by Iteration'),
]
for panel_position, (profile, y_label, panel_title) in enumerate(profile_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.plot(range(iterations), profile)
    # Both profiles share the same 0-1 vertical range
    plt.ylim([0,1])
    plt.xlabel('Iterations')
    plt.ylabel(y_label)
    plt.title(panel_title)

plt.show()

##################################
# Plotting model weight profiles
##################################
num_layers = len(hidden_sizes) + 1
# Rounding the column count up so every layer gets a slot in the
# two-row grid; 2 x (num_layers // 2) is too small whenever the
# number of layers is odd (and is an empty grid for a single layer)
num_columns = (num_layers + 1) // 2
plt.figure(figsize=(15, 10))
for i in range(1, num_layers + 1):
    # One panel per layer showing the maximum, mean and minimum
    # absolute weight recorded at each iteration
    plt.subplot(2, num_columns, i)
    plt.plot(range(iterations), [np.max(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Maximum Weight")
    plt.plot(range(iterations), [np.mean(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Mean Weight")
    plt.plot(range(iterations), [np.min(np.abs(weights)) for weights in weight_history[f'W{i}']], label="Minimum Weight")
    # Fixed y-range so the layer panels are directly comparable
    plt.ylim([-1,5])
    plt.legend(loc="upper right")
    plt.xlabel('Iterations')
    plt.ylabel('Absolute Weight')
    plt.title(f'ElasticNet Regularization: Layer {i} Weights by Iteration')

plt.show()

##################################
# Gathering the final accuracy, cost 
# and mean layer weight values for 
# ElasticNet Regularization
##################################
# Only the last recorded snapshot of each layer feeds the summary,
# so the mean absolute weight is computed on that snapshot alone
# instead of re-averaging the entire weight history
ENR_final_mean_weights = [np.mean(np.abs(weight_history[f'W{layer}'][-1])) for layer in range(1, 5)]
ENR_metrics = pd.DataFrame(["ACCURACY","LOSS"] + [f"LAYER {layer} MEAN WEIGHT" for layer in range(1, 5)])
ENR_values = pd.DataFrame([accuracies[-1], costs[-1]] + ENR_final_mean_weights)
# One method label per metric row
ENR_method = pd.DataFrame(["ElasticNet Regularization"] * len(ENR_metrics))
# Placing the metric names, values and method label side by side
ENR_summary = pd.concat([ENR_metrics, 
                         ENR_values,
                         ENR_method], axis=1)
ENR_summary.columns = ['Metric', 'Value', 'Method']
ENR_summary.reset_index(inplace=True, drop=True)
display(ENR_summary)

##################################
# Stacking the per-method summary
# tables into one comparison table
##################################
# Order of the list fixes the row order of the combined table
method_summaries = [NR_summary, L1R_summary, L2R_summary, ENR_summary]
model_performance_comparison = pd.concat(method_summaries, ignore_index=True)
print('Neural Network Model Comparison: ')
display(model_performance_comparison)

Neural Network Model Comparison:

##################################
# Extracting the accuracy rows
# of every model from the
# consolidated comparison table
##################################
is_accuracy_row = model_performance_comparison['Metric'].eq('ACCURACY')
# Renumbering the surviving rows from zero
model_performance_comparison_accuracy = model_performance_comparison[is_accuracy_row].reset_index(drop=True)
model_performance_comparison_accuracy

##################################
# Drawing a horizontal bar chart
# of the accuracy of every model
##################################
accuracy_methods = model_performance_comparison_accuracy['Method']
accuracy_values = model_performance_comparison_accuracy['Value']
# Value labels are drawn in white with a negative padding of 50
bar_label_style = dict(fmt='%.5f', padding=-50, color='white', fontweight='bold')

fig, ax = plt.subplots(figsize=(7, 7))
accuracy_hbar = ax.barh(accuracy_methods, accuracy_values)
ax.set_xlabel("Accuracy")
ax.set_ylabel("Neural Network Classification Models")
ax.bar_label(accuracy_hbar, **bar_label_style)
ax.set_xlim(0,1)
plt.show()

##################################
# Extracting the loss rows
# of every model from the
# consolidated comparison table
##################################
is_loss_row = model_performance_comparison['Metric'].eq('LOSS')
# Renumbering the surviving rows from zero
model_performance_comparison_loss = model_performance_comparison[is_loss_row].reset_index(drop=True)
model_performance_comparison_loss

##################################
# Drawing a horizontal bar chart
# of the loss error of every model
##################################
loss_methods = model_performance_comparison_loss['Method']
loss_values = model_performance_comparison_loss['Value']
# Value labels are drawn in white with a negative padding of 50
bar_label_style = dict(fmt='%.5f', padding=-50, color='white', fontweight='bold')

fig, ax = plt.subplots(figsize=(7, 7))
loss_hbar = ax.barh(loss_methods, loss_values)
ax.set_xlabel("Loss Error")
ax.set_ylabel("Neural Network Classification Models")
ax.bar_label(loss_hbar, **bar_label_style)
ax.set_xlim(0,0.25)
plt.show()

##################################
# Consolidating the mean weights
# for all models
##################################
weight_labels = ['LAYER 1 MEAN WEIGHT','LAYER 2 MEAN WEIGHT','LAYER 3 MEAN WEIGHT','LAYER 4 MEAN WEIGHT']

# Selecting the layer mean weight rows once with a single membership
# test on weight_labels, instead of repeating a four-way OR of
# equality checks for every method; row order is preserved
layer_weight_rows = model_performance_comparison[model_performance_comparison['Metric'].isin(weight_labels)]

# Splitting the selected rows by method and keeping only the values
NR_weights = layer_weight_rows.loc[layer_weight_rows['Method']=='No Regularization', 'Value'].values

L1R_weights = layer_weight_rows.loc[layer_weight_rows['Method']=='L1 Regularization', 'Value'].values

L2R_weights = layer_weight_rows.loc[layer_weight_rows['Method']=='L2 Regularization', 'Value'].values

ENR_weights = layer_weight_rows.loc[layer_weight_rows['Method']=='ElasticNet Regularization', 'Value'].values

##################################
# Arranging the mean weights into
# a layer-by-method table
# for all models
##################################
# Insertion order of the dictionary fixes the column order
mean_weights_by_method = {}
mean_weights_by_method['No Regularization'] = list(NR_weights)
mean_weights_by_method['L1 Regularization'] = list(L1R_weights)
mean_weights_by_method['L2 Regularization'] = list(L2R_weights)
mean_weights_by_method['ElasticNet Regularization'] = list(ENR_weights)

# One row per layer
layer_row_labels = ['LAYER 1 MEAN WEIGHT','LAYER 2 MEAN WEIGHT','LAYER 3 MEAN WEIGHT','LAYER 4 MEAN WEIGHT']
NN_layer_mean_weight_plot = pd.DataFrame(mean_weights_by_method, index=layer_row_labels)
NN_layer_mean_weight_plot

##################################
# Drawing a grouped horizontal bar
# chart of the mean weights
# for all models
##################################
# NOTE: from this point on the name NN_layer_mean_weight_plot refers
# to the plot axes returned by barh, no longer to the table
NN_layer_mean_weight_plot = NN_layer_mean_weight_plot.plot.barh(figsize=(10, 6), width=0.90)
mean_weight_axes = NN_layer_mean_weight_plot
mean_weight_axes.set_xlim(0.00,1.25)
mean_weight_axes.set_title("Model Comparison by Neural Network Layer Mean Weights")
mean_weight_axes.set_xlabel("Absolute Weight")
mean_weight_axes.set_ylabel("Regularization Conditions")
mean_weight_axes.grid(False)
# Anchoring the legend to the right of the plotting area
mean_weight_axes.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Writing the value on each bar, one bar container at a time
bar_label_style = dict(fmt='%.5f', padding=-50, color='white', fontweight='bold')
for bar_group in mean_weight_axes.containers:
    mean_weight_axes.bar_label(bar_group, **bar_label_style)

from IPython.display import display, HTML

# Emitting a <style> element that sets the font size and family
# for elements with the rendered_html class
notebook_font_css = "<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"
display(HTML(notebook_font_css))

	COUNTRY	CANRAT	GDPPER	URBPOP	PATRES	RNDGDP	POPGRO	LIFEXP	TUBINC	DTHCMD	...	RELOUT	METEMI	FORARE	CO2EMI	PM2EXP	POPDEN	ENRTER	GDPCAP	HDICAT	EPISCO
0	Australia	High	98380.63601	86.241	2368.0	NaN	1.235701	83.200000	7.2	4.941054	...	13.637841	131484.763200	17.421315	14.772658	24.893584	3.335312	110.139221	51722.06900	VH	60.1
1	New Zealand	High	77541.76438	86.699	348.0	NaN	2.204789	82.256098	7.2	4.354730	...	80.081439	32241.937000	37.570126	6.160799	NaN	19.331586	75.734833	41760.59478	VH	56.7
2	Ireland	High	198405.87500	63.653	75.0	1.23244	1.029111	82.556098	5.3	5.684596	...	27.965408	15252.824630	11.351720	6.768228	0.274092	72.367281	74.680313	85420.19086	VH	57.4
3	United States	High	130941.63690	82.664	269586.0	3.42287	0.964348	76.980488	2.3	5.302060	...	13.228593	748241.402900	33.866926	13.032828	3.343170	36.240985	87.567657	63528.63430	VH	51.1
4	Denmark	High	113300.60110	88.116	1261.0	2.96873	0.291641	81.602439	4.1	6.826140	...	65.505925	7778.773921	15.711000	4.691237	56.914456	145.785100	82.664330	60915.42440	VH	77.9

	count	mean	std	min	25%	50%	75%	max
GDPPER	165.0	45284.424283	3.941794e+04	1718.804896	13545.254510	34024.900890	66778.416050	2.346469e+05
URBPOP	174.0	59.788121	2.280640e+01	13.345000	42.432750	61.701500	79.186500	1.000000e+02
PATRES	108.0	20607.388889	1.340683e+05	1.000000	35.250000	244.500000	1297.750000	1.344817e+06
RNDGDP	74.0	1.197474	1.189956e+00	0.039770	0.256372	0.873660	1.608842	5.354510e+00
POPGRO	174.0	1.127028	1.197718e+00	-2.079337	0.236900	1.179959	2.031154	3.727101e+00
LIFEXP	174.0	71.746113	7.606209e+00	52.777000	65.907500	72.464610	77.523500	8.456000e+01
TUBINC	174.0	105.005862	1.367229e+02	0.770000	12.000000	44.500000	147.750000	5.920000e+02
DTHCMD	170.0	21.260521	1.927333e+01	1.283611	6.078009	12.456279	36.980457	6.520789e+01
AGRLND	174.0	38.793456	2.171551e+01	0.512821	20.130276	40.386649	54.013754	8.084112e+01
GHGEMI	170.0	259582.709895	1.118550e+06	179.725150	12527.487367	41009.275980	116482.578575	1.294287e+07
RELOUT	153.0	39.760036	3.191492e+01	0.000296	10.582691	32.381668	63.011450	1.000000e+02
METEMI	170.0	47876.133575	1.346611e+05	11.596147	3662.884908	11118.976025	32368.909040	1.186285e+06
FORARE	173.0	32.218177	2.312001e+01	0.008078	11.604388	31.509048	49.071780	9.741212e+01
CO2EMI	170.0	3.751097	4.606479e+00	0.032585	0.631924	2.298368	4.823496	3.172684e+01
PM2EXP	167.0	91.940595	2.206003e+01	0.274092	99.627134	100.000000	100.000000	1.000000e+02
POPDEN	174.0	200.886765	6.453834e+02	2.115134	27.454539	77.983133	153.993650	7.918951e+03
ENRTER	116.0	49.994997	2.970619e+01	2.432581	22.107195	53.392460	71.057467	1.433107e+02
GDPCAP	170.0	13992.095610	1.957954e+04	216.827417	1870.503029	5348.192875	17421.116227	1.173705e+05
EPISCO	165.0	42.946667	1.249086e+01	18.900000	33.000000	40.900000	50.500000	7.790000e+01

	Row.Name	Column.Count	Null.Count	Missing.Rate
0	Australia	22	1	0.045455
1	New Zealand	22	2	0.090909
2	Ireland	22	0	0.000000
3	United States	22	0	0.000000
4	Denmark	22	0	0.000000
...	...	...	...	...
172	Congo Republic	22	3	0.136364
173	Bhutan	22	2	0.090909
174	Nepal	22	2	0.090909
175	Gambia	22	4	0.181818
176	Niger	22	2	0.090909

	Numeric.Column.Name	Minimum	Mean	Median	Maximum	First.Mode	Second.Mode	First.Mode.Count	Second.Mode.Count	First.Second.Mode.Ratio	Unique.Count	Row.Count	Unique.Count.Ratio	Skewness	Kurtosis
0	GDPPER	1718.804896	45284.424283	34024.900890	2.346469e+05	98380.636010	77541.764380	1	1	1.000000	165	177	0.932203	1.517574	3.471992
1	URBPOP	13.345000	59.788121	61.701500	1.000000e+02	100.000000	86.699000	2	1	2.000000	173	177	0.977401	-0.210702	-0.962847
2	PATRES	1.000000	20607.388889	244.500000	1.344817e+06	6.000000	2.000000	4	3	1.333333	97	177	0.548023	9.284436	91.187178
3	RNDGDP	0.039770	1.197474	0.873660	5.354510e+00	1.232440	3.422870	1	1	1.000000	74	177	0.418079	1.396742	1.695957
4	POPGRO	-2.079337	1.127028	1.179959	3.727101e+00	1.235701	2.204789	1	1	1.000000	174	177	0.983051	-0.195161	-0.423580
5	LIFEXP	52.777000	71.746113	72.464610	8.456000e+01	83.200000	82.256098	1	1	1.000000	174	177	0.983051	-0.357965	-0.649601
6	TUBINC	0.770000	105.005862	44.500000	5.920000e+02	12.000000	4.100000	4	3	1.333333	131	177	0.740113	1.746333	2.429368
7	DTHCMD	1.283611	21.260521	12.456279	6.520789e+01	4.941054	4.354730	1	1	1.000000	170	177	0.960452	0.900509	-0.691541
8	AGRLND	0.512821	38.793456	40.386649	8.084112e+01	46.252480	38.562911	1	1	1.000000	174	177	0.983051	0.074000	-0.926249
9	GHGEMI	179.725150	259582.709895	41009.275980	1.294287e+07	571903.119900	80158.025830	1	1	1.000000	170	177	0.960452	9.496120	101.637308
10	RELOUT	0.000296	39.760036	32.381668	1.000000e+02	100.000000	80.081439	3	1	3.000000	151	177	0.853107	0.501088	-0.981774
11	METEMI	11.596147	47876.133575	11118.976025	1.186285e+06	131484.763200	32241.937000	1	1	1.000000	170	177	0.960452	5.801014	38.661386
12	FORARE	0.008078	32.218177	31.509048	9.741212e+01	17.421315	37.570126	1	1	1.000000	173	177	0.977401	0.519277	-0.322589
13	CO2EMI	0.032585	3.751097	2.298368	3.172684e+01	14.772658	6.160799	1	1	1.000000	170	177	0.960452	2.721552	10.311574
14	PM2EXP	0.274092	91.940595	100.000000	1.000000e+02	100.000000	100.000000	106	2	53.000000	61	177	0.344633	-3.141557	9.032386
15	POPDEN	2.115134	200.886765	77.983133	7.918951e+03	3.335312	19.331586	1	1	1.000000	174	177	0.983051	10.267750	119.995256
16	ENRTER	2.432581	49.994997	53.392460	1.433107e+02	110.139221	75.734833	1	1	1.000000	116	177	0.655367	0.275863	-0.392895
17	GDPCAP	216.827417	13992.095610	5348.192875	1.173705e+05	51722.069000	41760.594780	1	1	1.000000	170	177	0.960452	2.258568	5.938690
18	EPISCO	18.900000	42.946667	40.900000	7.790000e+01	29.600000	43.600000	3	3	1.000000	137	177	0.774011	0.641799	0.035208

	Numeric.Column.Name	Minimum	Mean	Median	Maximum	First.Mode	Second.Mode	First.Mode.Count	Second.Mode.Count	First.Second.Mode.Ratio	Unique.Count	Row.Count	Unique.Count.Ratio	Skewness	Kurtosis
15	POPDEN	2.115134	200.886765	77.983133	7.918951e+03	3.335312	19.331586	1	1	1.000000	174	177	0.983051	10.267750	119.995256
9	GHGEMI	179.725150	259582.709895	41009.275980	1.294287e+07	571903.119900	80158.025830	1	1	1.000000	170	177	0.960452	9.496120	101.637308
2	PATRES	1.000000	20607.388889	244.500000	1.344817e+06	6.000000	2.000000	4	3	1.333333	97	177	0.548023	9.284436	91.187178
11	METEMI	11.596147	47876.133575	11118.976025	1.186285e+06	131484.763200	32241.937000	1	1	1.000000	170	177	0.960452	5.801014	38.661386
14	PM2EXP	0.274092	91.940595	100.000000	1.000000e+02	100.000000	100.000000	106	2	53.000000	61	177	0.344633	-3.141557	9.032386

Supervised Learning: Exploring Regularization Approaches for Controlling Model Complexity Through Weight Penalization for Neural Network Classification

John Pauline Pineda

April 25, 2024

1. Table of Contents

1.1. Data Background

1.2. Data Description

1.3. Data Quality Assessment

1.4. Data Preprocessing

1.4.1 Data Cleaning

1.4.2 Missing Data Imputation

1.4.3 Outlier Detection

1.4.4 Collinearity

1.4.5 Shape Transformation

1.4.6 Centering and Scaling

1.4.7 Data Encoding

1.4.8 Preprocessed Data Description

1.5. Data Exploration

1.5.1 Exploratory Data Analysis

1.5.2 Hypothesis Testing

1.6. Neural Network Classification Gradient and Weight Updates

1.6.1 Premodelling Data Description

1.6.2 No Regularization

1.6.3 L1 Regularization

1.6.4 L2 Regularization

1.6.5 ElasticNet Regularization

1.7. Consolidated Findings

2. Summary

3. References

	Column.Name	Column.Type	Row.Count	Non.Null.Count	Null.Count	Fill.Rate
0	COUNTRY	object	177	177	0	1.000000
1	CANRAT	category	177	177	0	1.000000
2	GDPPER	float64	177	165	12	0.932203
3	URBPOP	float64	177	174	3	0.983051
4	PATRES	float64	177	108	69	0.610169
5	RNDGDP	float64	177	74	103	0.418079
6	POPGRO	float64	177	174	3	0.983051
7	LIFEXP	float64	177	174	3	0.983051
8	TUBINC	float64	177	174	3	0.983051
9	DTHCMD	float64	177	170	7	0.960452
10	AGRLND	float64	177	174	3	0.983051
11	GHGEMI	float64	177	170	7	0.960452
12	RELOUT	float64	177	153	24	0.864407
13	METEMI	float64	177	170	7	0.960452
14	FORARE	float64	177	173	4	0.977401
15	CO2EMI	float64	177	170	7	0.960452
16	PM2EXP	float64	177	167	10	0.943503
17	POPDEN	float64	177	174	3	0.983051
18	ENRTER	float64	177	116	61	0.655367
19	GDPCAP	float64	177	170	7	0.960452
20	HDICAT	category	177	167	10	0.943503
21	EPISCO	float64	177	165	12	0.932203

	Row.Name	Column.Count	Null.Count	Missing.Rate
35	Guadeloupe	22	20	0.909091
39	Martinique	22	20	0.909091
56	French Guiana	22	20	0.909091
13	New Caledonia	22	11	0.500000
44	French Polynesia	22	11	0.500000
75	Guam	22	11	0.500000
53	Puerto Rico	22	9	0.409091
85	North Korea	22	6	0.272727
168	South Sudan	22	6	0.272727
132	Somalia	22	6	0.272727
117	Libya	22	5	0.227273
73	Venezuela	22	5	0.227273
161	Eritrea	22	5	0.227273
164	Yemen	22	5	0.227273

	Categorical.Column.Name	First.Mode	Second.Mode	First.Mode.Count	Second.Mode.Count	First.Second.Mode.Ratio	Unique.Count	Row.Count	Unique.Count.Ratio
0	CANRAT	Low	High	132	45	2.933333	2	177	0.011299
1	HDICAT	VH	H	59	39	1.512821	4	177	0.022599

	Numeric.Column.Name	Skewness	Outlier.Count	Row.Count	Outlier.Ratio
0	GDPPER	1.554457	3	163	0.018405
1	URBPOP	-0.212327	0	163	0.000000
2	POPGRO	-0.181666	0	163	0.000000
3	LIFEXP	-0.329704	0	163	0.000000
4	TUBINC	1.747962	12	163	0.073620
5	DTHCMD	0.930709	0	163	0.000000
6	AGRLND	0.035315	0	163	0.000000
7	GHGEMI	9.299960	27	163	0.165644
8	METEMI	5.688689	20	163	0.122699
9	FORARE	0.563015	0	163	0.000000
10	CO2EMI	2.693585	11	163	0.067485
11	PM2EXP	-3.088403	37	163	0.226994
12	POPDEN	9.972806	20	163	0.122699
13	GDPCAP	2.311079	22	163	0.134969
14	EPISCO	0.635994	3	163	0.018405

	Pearson.Correlation.Coefficient	Correlation.PValue
GDPPER_GDPCAP	0.921010	8.158179e-68
GHGEMI_METEMI	0.905121	1.087643e-61
POPGRO_DTHCMD	0.759470	7.124695e-32
GDPPER_LIFEXP	0.755787	2.055178e-31
GDPCAP_EPISCO	0.696707	5.312642e-25
LIFEXP_GDPCAP	0.683834	8.321371e-24
GDPPER_EPISCO	0.680812	1.555304e-23
GDPPER_URBPOP	0.666394	2.781623e-22
GDPPER_CO2EMI	0.654958	2.450029e-21
TUBINC_DTHCMD	0.643615	1.936081e-20
URBPOP_LIFEXP	0.623997	5.669778e-19
LIFEXP_EPISCO	0.620271	1.048393e-18
URBPOP_GDPCAP	0.559181	8.624533e-15
CO2EMI_GDPCAP	0.550221	2.782997e-14
URBPOP_CO2EMI	0.550046	2.846393e-14
LIFEXP_CO2EMI	0.531305	2.951829e-13
URBPOP_EPISCO	0.510131	3.507463e-12
POPGRO_TUBINC	0.442339	3.384403e-09
DTHCMD_PM2EXP	0.283199	2.491837e-04
CO2EMI_EPISCO	0.282734	2.553620e-04

	T.Test.Statistic	T.Test.PValue
CANRAT_GDPCAP	-11.936988	6.247937e-24
CANRAT_EPISCO	-11.788870	1.605980e-23
CANRAT_LIFEXP	-10.979098	2.754214e-21
CANRAT_TUBINC	9.608760	1.463678e-17
CANRAT_DTHCMD	8.375558	2.552108e-14
CANRAT_CO2EMI	-7.030702	5.537463e-11
CANRAT_URBPOP	-6.541001	7.734940e-10
CANRAT_POPGRO	4.904817	2.269446e-06
CANRAT_GHGEMI	-2.243089	2.625563e-02
CANRAT_FORARE	-1.174143	2.420717e-01
CANRAT_POPDEN	-0.495221	6.211191e-01
CANRAT_AGRLND	-0.047628	9.620720e-01

	ChiSquare.Test.Statistic	ChiSquare.Test.PValue
CANRAT_HDICAT_VH	76.764134	1.926446e-18
CANRAT_HDICAT_M	13.860367	1.969074e-04
CANRAT_HDICAT_L	10.285575	1.340742e-03
CANRAT_HDICAT_H	9.080788	2.583087e-03

	URBPOP	POPGRO	LIFEXP	TUBINC	DTHCMD	AGRLND	GHGEMI	FORARE	CO2EMI	POPDEN	GDPCAP	EPISCO	CANRAT
0	1.186561	0.075944	1.643195	-1.102296	-0.971464	0.377324	1.388807	-0.467775	1.736841	-2.208974	1.549766	1.306738	High
1	1.207291	0.916022	1.487969	-1.102296	-1.091413	0.043134	0.367037	0.398501	0.943507	-0.989532	1.407752	1.102912	High
2	0.172100	-0.100235	1.537044	-1.275298	-0.836295	1.162279	0.211987	-0.815470	1.031680	-0.007131	1.879374	1.145832	High
3	1.024859	-0.155217	0.664178	-1.696341	-0.903718	0.296508	2.565441	0.259803	1.627748	-0.522844	1.685426	0.739753	High
4	1.271466	-0.718131	1.381877	-1.413414	-0.657145	1.162434	0.019979	-0.559264	0.686270	0.512619	1.657777	2.218327	High

	Metric	Value	Method
0	ACCURACY	0.920245	No Regularization
1	LOSS	0.205037	No Regularization
2	LAYER 1 MEAN WEIGHT	0.878401	No Regularization
3	LAYER 2 MEAN WEIGHT	0.663882	No Regularization
4	LAYER 3 MEAN WEIGHT	0.662323	No Regularization
5	LAYER 4 MEAN WEIGHT	1.042851	No Regularization

	Metric	Value	Method
0	ACCURACY	0.944785	L1 Regularization
1	LOSS	0.179959	L1 Regularization
2	LAYER 1 MEAN WEIGHT	0.493351	L1 Regularization
3	LAYER 2 MEAN WEIGHT	0.413750	L1 Regularization
4	LAYER 3 MEAN WEIGHT	0.390757	L1 Regularization
5	LAYER 4 MEAN WEIGHT	0.715697	L1 Regularization

	Metric	Value	Method
0	ACCURACY	0.944785	L2 Regularization
1	LOSS	0.175098	L2 Regularization
2	LAYER 1 MEAN WEIGHT	0.537587	L2 Regularization
3	LAYER 2 MEAN WEIGHT	0.503285	L2 Regularization
4	LAYER 3 MEAN WEIGHT	0.487109	L2 Regularization
5	LAYER 4 MEAN WEIGHT	0.749369	L2 Regularization

	Metric	Value	Method
0	ACCURACY	0.969325	ElasticNet Regularization
1	LOSS	0.109622	ElasticNet Regularization
2	LAYER 1 MEAN WEIGHT	0.275964	ElasticNet Regularization
3	LAYER 2 MEAN WEIGHT	0.374856	ElasticNet Regularization
4	LAYER 3 MEAN WEIGHT	0.371434	ElasticNet Regularization
5	LAYER 4 MEAN WEIGHT	0.596937	ElasticNet Regularization

Supervised Learning: Exploring Regularization Approaches for Controlling Model Complexity Through Weight Penalization for Neural Network Classification ¶

John Pauline Pineda April 25, 2024

1. Table of Contents ¶

1.1. Data Background ¶

1.2. Data Description ¶

1.3. Data Quality Assessment ¶

1.4. Data Preprocessing ¶

1.4.1 Data Cleaning ¶

1.4.2 Missing Data Imputation ¶

1.4.3 Outlier Detection ¶

1.4.4 Collinearity ¶

1.4.5 Shape Transformation ¶

1.4.6 Centering and Scaling ¶

1.4.7 Data Encoding ¶

1.4.8 Preprocessed Data Description ¶

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

1.5.2 Hypothesis Testing ¶

1.6. Neural Network Classification Gradient and Weight Updates ¶

1.6.1 Premodelling Data Description ¶

1.6.2 No Regularization ¶

1.6.3 L1 Regularization ¶

1.6.4 L2 Regularization ¶

1.6.5 ElasticNet Regularization ¶

1.7. Consolidated Findings ¶

2. Summary ¶

3. References ¶

John Pauline Pineda

April 25, 2024