##################################
# Loading Python Libraries
##################################
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import os
%matplotlib inline

from operator import add,mul,truediv
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler
from scipy import stats

from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.pipeline import Pipeline

##################################
# Configuring how numpy and pandas
# print floating-point values:
# no scientific notation for numpy,
# four decimal places for both
##################################
np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.float_format', '{:.4f}'.format)

##################################
# Defining file paths
# (joined with os.path.join so the
# separator matches the host OS
# instead of a hard-coded backslash)
##################################
DATASETS_ORIGINAL_PATH = os.path.join("datasets", "original")

##################################
# Loading the dataset
# from the DATASETS_ORIGINAL_PATH,
# resolved relative to the parent
# of the working directory
##################################
cancer_rate = pd.read_csv(os.path.join("..", DATASETS_ORIGINAL_PATH, "NumericCancerRates.csv"))

##################################
# Reporting the size of the dataset
# as a (row count, column count) pair
##################################
print('Dataset Dimensions: ')
display((len(cancer_rate.index), len(cancer_rate.columns)))

Dataset Dimensions:

(177, 22)

##################################
# Showing the data type assigned
# to every column of the dataset
##################################
print('Column Names and Data Types:')
column_data_types = cancer_rate.dtypes
display(column_data_types)

Column Names and Data Types:

COUNTRY     object
CANRAT     float64
GDPPER     float64
URBPOP     float64
PATRES     float64
RNDGDP     float64
POPGRO     float64
LIFEXP     float64
TUBINC     float64
DTHCMD     float64
AGRLND     float64
GHGEMI     float64
RELOUT     float64
METEMI     float64
FORARE     float64
CO2EMI     float64
PM2EXP     float64
POPDEN     float64
ENRTER     float64
GDPCAP     float64
HDICAT      object
EPISCO     float64
dtype: object

##################################
# Previewing the first five rows of the dataset
##################################
cancer_rate.head(n=5)

##################################
# Converting HDICAT into an ordered
# categorical variable whose levels
# run L < M < H < VH
##################################
hdicat_levels = ['L', 'M', 'H', 'VH']
cancer_rate['HDICAT'] = cancer_rate['HDICAT'].astype('category').cat.set_categories(hdicat_levels, ordered=True)

##################################
# Summarizing the numeric variables
# (one row per variable)
##################################
print('Numeric Variable Summary:')
numeric_variable_summary = cancer_rate.describe(include='number')
display(numeric_variable_summary.T)

Numeric Variable Summary:

##################################
# Summarizing the object variable
# (one row per variable)
##################################
print('Object Variable Summary:')
object_variable_summary = cancer_rate.describe(include='object')
display(object_variable_summary.T)

Object Variable Summary:

##################################
# Summarizing the categorical variable
# (one row per variable)
##################################
print('Categorical Variable Summary:')
categorical_variable_summary = cancer_rate.describe(include='category')
display(categorical_variable_summary.T)

Categorical Variable Summary:

##################################
# Tallying the rows that repeat
# an earlier row across all columns
##################################
cancer_rate.duplicated(subset=None, keep='first').sum()

np.int64(0)

##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate.dtypes)

##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate.columns)

##################################
# Gathering the number of observations for each column
##################################
row_count_list = [len(cancer_rate)] * len(cancer_rate.columns)

##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate.isna().sum(axis=0))

##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate.count())

##################################
# Gathering the fill rate
# (non-missing count / row count)
# for each column; stored as a list
# rather than a one-shot map iterator
# so the summary below can be rebuilt
##################################
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

##################################
# Formulating the summary
# for all columns
##################################
all_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                              data_type_list,
                                              row_count_list,
                                              non_null_count_list,
                                              null_count_list,
                                              fill_rate_list), 
                                        columns=['Column.Name',
                                                 'Column.Type',
                                                 'Row.Count',
                                                 'Non.Null.Count',
                                                 'Null.Count',                                                 
                                                 'Fill.Rate'])
display(all_column_quality_summary)

##################################
# Tallying the columns that are
# not completely filled
# (Fill.Rate < 1.00)
##################################
incomplete_column_mask = all_column_quality_summary['Fill.Rate'] < 1
len(all_column_quality_summary.loc[incomplete_column_mask])

20

##################################
# Listing the columns
# with Fill.Rate < 1.00,
# least-filled first
##################################
partially_filled_columns = all_column_quality_summary.loc[all_column_quality_summary['Fill.Rate'] < 1]
display(partially_filled_columns.sort_values(by='Fill.Rate', ascending=True))

##################################
# Keeping the summary entries
# of the columns
# with Fill.Rate < 0.90
##################################
low_fill_column_mask = all_column_quality_summary['Fill.Rate'] < 0.90
column_low_fill_rate = all_column_quality_summary.loc[low_fill_column_mask]

##################################
# Gathering the metadata labels for each observation
##################################
row_metadata_list = cancer_rate["COUNTRY"].values.tolist()

##################################
# Gathering the number of columns for each observation
##################################
column_count_list = [len(cancer_rate.columns)] * len(cancer_rate)

##################################
# Gathering the number of missing data for each row
##################################
null_row_list = list(cancer_rate.isna().sum(axis=1))

##################################
# Gathering the missing data rate
# (missing count / column count)
# for each row; stored as a list
# rather than a one-shot map iterator
# so the summary below can be rebuilt
##################################
missing_rate_list = list(map(truediv, null_row_list, column_count_list))

##################################
# Formulating the summary
# for all rows
##################################
all_row_quality_summary = pd.DataFrame(zip(row_metadata_list,
                                           column_count_list,
                                           null_row_list,
                                           missing_rate_list), 
                                        columns=['Row.Name',
                                                 'Column.Count',
                                                 'Null.Count',                                                 
                                                 'Missing.Rate'])
display(all_row_quality_summary)

##################################
# Tallying the rows that have
# at least one missing value
# (Missing.Rate > 0.00)
##################################
rows_with_missing_mask = all_row_quality_summary['Missing.Rate'] > 0.00
len(all_row_quality_summary.loc[rows_with_missing_mask])

120

##################################
# Tallying the rows that miss
# more than a fifth of their values
# (Missing.Rate > 0.20)
##################################
rows_high_missing_mask = all_row_quality_summary['Missing.Rate'] > 0.20
len(all_row_quality_summary.loc[rows_high_missing_mask])

14

##################################
# Keeping the summary entries
# of the rows
# with Missing.Rate > 0.20
##################################
high_missing_row_mask = all_row_quality_summary['Missing.Rate'] > 0.20
row_high_missing_rate = all_row_quality_summary.loc[high_missing_row_mask]

##################################
# Listing the rows
# with Missing.Rate > 0.20,
# most incomplete first
##################################
display(all_row_quality_summary.loc[high_missing_row_mask].sort_values(by='Missing.Rate', ascending=False))

##################################
# Formulating the dataset
# with numeric columns only
##################################
cancer_rate_numeric = cancer_rate.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = cancer_rate_numeric.columns

##################################
# Gathering the minimum value for each numeric column
##################################
numeric_minimum_list = cancer_rate_numeric.min()

##################################
# Gathering the mean value for each numeric column
##################################
numeric_mean_list = cancer_rate_numeric.mean()

##################################
# Gathering the median value for each numeric column
##################################
numeric_median_list = cancer_rate_numeric.median()

##################################
# Gathering the maximum value for each numeric column
##################################
numeric_maximum_list = cancer_rate_numeric.max()

##################################
# Tabulating the value frequencies
# of each numeric column once
# (missing values excluded) so the
# mode values and mode counts below
# share a single computation
##################################
numeric_value_counts_list = [cancer_rate_numeric[x].value_counts(dropna=True) for x in cancer_rate_numeric]

##################################
# Gathering the first mode values for each numeric column
##################################
numeric_first_mode_list = [value_counts.index[0] for value_counts in numeric_value_counts_list]

##################################
# Gathering the second mode values for each numeric column
# (requires at least two distinct values per column)
##################################
numeric_second_mode_list = [value_counts.index[1] for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of first mode values for each numeric column
##################################
numeric_first_mode_count_list = [value_counts.iloc[0] for value_counts in numeric_value_counts_list]

##################################
# Gathering the count of second mode values for each numeric column
##################################
numeric_second_mode_count_list = [value_counts.iloc[1] for value_counts in numeric_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each numeric column
# (kept as a list so it can be reused after the summary is built)
##################################
numeric_first_second_mode_ratio_list = list(map(truediv, numeric_first_mode_count_list, numeric_second_mode_count_list))

##################################
# Gathering the count of unique values for each numeric column
##################################
numeric_unique_count_list = cancer_rate_numeric.nunique(dropna=True)

##################################
# Gathering the number of observations for each numeric column
##################################
numeric_row_count_list = [len(cancer_rate_numeric)] * len(cancer_rate_numeric.columns)

##################################
# Gathering the unique to count ratio for each numeric column
# (kept as a list so it can be reused after the summary is built)
##################################
numeric_unique_count_ratio_list = list(map(truediv, numeric_unique_count_list, numeric_row_count_list))

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_numeric.skew()

##################################
# Gathering the kurtosis value for each numeric column
##################################
numeric_kurtosis_list = cancer_rate_numeric.kurtosis()

##################################
# Formulating the summary
# for all numeric columns
##################################
numeric_column_quality_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                numeric_minimum_list,
                                                numeric_mean_list,
                                                numeric_median_list,
                                                numeric_maximum_list,
                                                numeric_first_mode_list,
                                                numeric_second_mode_list,
                                                numeric_first_mode_count_list,
                                                numeric_second_mode_count_list,
                                                numeric_first_second_mode_ratio_list,
                                                numeric_unique_count_list,
                                                numeric_row_count_list,
                                                numeric_unique_count_ratio_list,
                                                numeric_skewness_list,
                                                numeric_kurtosis_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Minimum',
                                                 'Mean',
                                                 'Median',
                                                 'Maximum',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio',
                                                 'Skewness',
                                                 'Kurtosis'])
display(numeric_column_quality_summary)

##################################
# Tallying the numeric columns whose
# most frequent value occurs more than
# five times as often as the runner-up
# (First.Second.Mode.Ratio > 5.00)
##################################
numeric_dominant_mode_mask = numeric_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(numeric_column_quality_summary.loc[numeric_dominant_mode_mask])

1

##################################
# Listing the numeric columns
# with First.Second.Mode.Ratio > 5.00,
# largest ratio first
##################################
numeric_dominant_mode_columns = numeric_column_quality_summary.loc[numeric_column_quality_summary['First.Second.Mode.Ratio'] > 5]
display(numeric_dominant_mode_columns.sort_values(by='First.Second.Mode.Ratio', ascending=False))

##################################
# Tallying the numeric columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is
# Unique.Count / Row.Count in the summary
# built earlier in this file, so it cannot
# exceed 1 and this filter never matches;
# confirm the intended threshold
##################################
numeric_unique_ratio_mask = numeric_column_quality_summary['Unique.Count.Ratio'] > 10
len(numeric_column_quality_summary.loc[numeric_unique_ratio_mask])

0

##################################
# Tallying the numeric columns
# that are strongly skewed
# in either direction
# (Skewness > 3.00 or Skewness < -3.00)
##################################
strong_right_skew_mask = numeric_column_quality_summary['Skewness'] > 3
strong_left_skew_mask = numeric_column_quality_summary['Skewness'] < (-3)
len(numeric_column_quality_summary.loc[strong_right_skew_mask | strong_left_skew_mask])

5

##################################
# Listing the numeric columns
# with Skewness > 3.00 or Skewness < -3.00,
# most positively skewed first
##################################
skewed_right_columns = numeric_column_quality_summary['Skewness'] > 3
skewed_left_columns = numeric_column_quality_summary['Skewness'] < (-3)
display(numeric_column_quality_summary.loc[skewed_right_columns | skewed_left_columns].sort_values(by='Skewness', ascending=False))

##################################
# Formulating the dataset
# with object column only
##################################
cancer_rate_object = cancer_rate.select_dtypes(include='object')

##################################
# Gathering the variable names for the object column
##################################
object_variable_name_list = cancer_rate_object.columns

##################################
# Tabulating the value frequencies
# of each object column once
# (missing values excluded) so the
# mode values and mode counts below
# share a single computation
##################################
object_value_counts_list = [cancer_rate_object[x].value_counts(dropna=True) for x in cancer_rate_object]

##################################
# Gathering the first mode values for the object column
##################################
object_first_mode_list = [value_counts.index[0] for value_counts in object_value_counts_list]

##################################
# Gathering the second mode values for each object column
# (requires at least two distinct values per column)
##################################
object_second_mode_list = [value_counts.index[1] for value_counts in object_value_counts_list]

##################################
# Gathering the count of first mode values for each object column
##################################
object_first_mode_count_list = [value_counts.iloc[0] for value_counts in object_value_counts_list]

##################################
# Gathering the count of second mode values for each object column
##################################
object_second_mode_count_list = [value_counts.iloc[1] for value_counts in object_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each object column
# (kept as a list so it can be reused after the summary is built)
##################################
object_first_second_mode_ratio_list = list(map(truediv, object_first_mode_count_list, object_second_mode_count_list))

##################################
# Gathering the count of unique values for each object column
##################################
object_unique_count_list = cancer_rate_object.nunique(dropna=True)

##################################
# Gathering the number of observations for each object column
##################################
object_row_count_list = [len(cancer_rate_object)] * len(cancer_rate_object.columns)

##################################
# Gathering the unique to count ratio for each object column
# (kept as a list so it can be reused after the summary is built)
##################################
object_unique_count_ratio_list = list(map(truediv, object_unique_count_list, object_row_count_list))

##################################
# Formulating the summary
# for all object columns
##################################
object_column_quality_summary = pd.DataFrame(zip(object_variable_name_list,
                                                 object_first_mode_list,
                                                 object_second_mode_list,
                                                 object_first_mode_count_list,
                                                 object_second_mode_count_list,
                                                 object_first_second_mode_ratio_list,
                                                 object_unique_count_list,
                                                 object_row_count_list,
                                                 object_unique_count_ratio_list), 
                                        columns=['Object.Column.Name',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio'])
display(object_column_quality_summary)

##################################
# Tallying the object columns whose
# most frequent value occurs more than
# five times as often as the runner-up
# (First.Second.Mode.Ratio > 5.00)
##################################
object_dominant_mode_mask = object_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(object_column_quality_summary.loc[object_dominant_mode_mask])

0

##################################
# Tallying the object columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is
# Unique.Count / Row.Count in the summary
# built earlier in this file, so it cannot
# exceed 1 and this filter never matches;
# confirm the intended threshold
##################################
object_unique_ratio_mask = object_column_quality_summary['Unique.Count.Ratio'] > 10
len(object_column_quality_summary.loc[object_unique_ratio_mask])

0

##################################
# Formulating the dataset
# with categorical columns only
##################################
cancer_rate_categorical = cancer_rate.select_dtypes(include='category')

##################################
# Gathering the variable names for the categorical column
##################################
categorical_variable_name_list = cancer_rate_categorical.columns

##################################
# Tabulating the value frequencies
# of each categorical column once
# (missing values excluded) so the
# mode values and mode counts below
# share a single computation
##################################
categorical_value_counts_list = [cancer_rate_categorical[x].value_counts(dropna=True) for x in cancer_rate_categorical]

##################################
# Gathering the first mode values for each categorical column
##################################
categorical_first_mode_list = [value_counts.index[0] for value_counts in categorical_value_counts_list]

##################################
# Gathering the second mode values for each categorical column
# (requires at least two levels per column)
##################################
categorical_second_mode_list = [value_counts.index[1] for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of first mode values for each categorical column
##################################
categorical_first_mode_count_list = [value_counts.iloc[0] for value_counts in categorical_value_counts_list]

##################################
# Gathering the count of second mode values for each categorical column
##################################
categorical_second_mode_count_list = [value_counts.iloc[1] for value_counts in categorical_value_counts_list]

##################################
# Gathering the first mode to second mode ratio for each categorical column
# (kept as a list so it can be reused after the summary is built)
##################################
categorical_first_second_mode_ratio_list = list(map(truediv, categorical_first_mode_count_list, categorical_second_mode_count_list))

##################################
# Gathering the count of unique values for each categorical column
##################################
categorical_unique_count_list = cancer_rate_categorical.nunique(dropna=True)

##################################
# Gathering the number of observations for each categorical column
##################################
categorical_row_count_list = [len(cancer_rate_categorical)] * len(cancer_rate_categorical.columns)

##################################
# Gathering the unique to count ratio for each categorical column
# (kept as a list so it can be reused after the summary is built)
##################################
categorical_unique_count_ratio_list = list(map(truediv, categorical_unique_count_list, categorical_row_count_list))

##################################
# Formulating the summary
# for all categorical columns
##################################
categorical_column_quality_summary = pd.DataFrame(zip(categorical_variable_name_list,
                                                    categorical_first_mode_list,
                                                    categorical_second_mode_list,
                                                    categorical_first_mode_count_list,
                                                    categorical_second_mode_count_list,
                                                    categorical_first_second_mode_ratio_list,
                                                    categorical_unique_count_list,
                                                    categorical_row_count_list,
                                                    categorical_unique_count_ratio_list), 
                                        columns=['Categorical.Column.Name',
                                                 'First.Mode',
                                                 'Second.Mode',
                                                 'First.Mode.Count',
                                                 'Second.Mode.Count',
                                                 'First.Second.Mode.Ratio',
                                                 'Unique.Count',
                                                 'Row.Count',
                                                 'Unique.Count.Ratio'])
display(categorical_column_quality_summary)

##################################
# Tallying the categorical columns whose
# most frequent level occurs more than
# five times as often as the runner-up
# (First.Second.Mode.Ratio > 5.00)
##################################
categorical_dominant_mode_mask = categorical_column_quality_summary['First.Second.Mode.Ratio'] > 5
len(categorical_column_quality_summary.loc[categorical_dominant_mode_mask])

0

##################################
# Tallying the categorical columns
# with Unique.Count.Ratio > 10.00
# NOTE(review): Unique.Count.Ratio is
# Unique.Count / Row.Count in the summary
# built earlier in this file, so it cannot
# exceed 1 and this filter never matches;
# confirm the intended threshold
##################################
categorical_unique_ratio_mask = categorical_column_quality_summary['Unique.Count.Ratio'] > 10
len(categorical_column_quality_summary.loc[categorical_unique_ratio_mask])

0

##################################
# Reporting the size of the original dataset
# before any rows or columns are removed
##################################
print('Dataset Dimensions: ')
original_row_count, original_column_count = cancer_rate.shape
display((original_row_count, original_column_count))

Dataset Dimensions:

(177, 22)

##################################
# Removing the rows whose COUNTRY
# appears among the rows
# with Missing.Rate > 0.20
##################################
high_missing_country_list = row_high_missing_rate['Row.Name'].values.tolist()
high_missing_row_index = cancer_rate.index[cancer_rate['COUNTRY'].isin(high_missing_country_list)]
cancer_rate_filtered_row = cancer_rate.drop(index=high_missing_row_index)

##################################
# Reporting the size of the row-filtered dataset
##################################
print('Dataset Dimensions: ')
display((len(cancer_rate_filtered_row.index), len(cancer_rate_filtered_row.columns)))

Dataset Dimensions:

(163, 22)

##################################
# Removing the columns
# with Fill.Rate < 0.90
# from the row-filtered dataset
##################################
low_fill_column_list = column_low_fill_rate['Column.Name'].values.tolist()
cancer_rate_filtered_row_column = cancer_rate_filtered_row.drop(columns=low_fill_column_list)

##################################
# Exposing the row- and column-filtered
# dataset under the name used
# for the cleaned data
##################################
cancer_rate_cleaned = cancer_rate_filtered_row_column

##################################
# Reporting the size of the cleaned dataset
##################################
print('Dataset Dimensions: ')
display((len(cancer_rate_cleaned.index), len(cancer_rate_cleaned.columns)))

Dataset Dimensions:

(163, 18)

##################################
# Collecting, for every cleaned column,
# its name, data type, row count,
# non-missing count and missing count
##################################
cleaned_column_name_list = list(cancer_rate_cleaned.columns)
cleaned_data_type_list = list(cancer_rate_cleaned.dtypes)
cleaned_row_count_list = list([len(cancer_rate_cleaned)] * len(cancer_rate_cleaned.columns))
cleaned_non_null_count_list = list(cancer_rate_cleaned.count())
cleaned_null_count_list = list(cancer_rate_cleaned.isna().sum(axis=0))

##################################
# Formulating the summary
# for all cleaned columns
##################################
cleaned_summary_columns = ['Column.Name',
                           'Column.Type',
                           'Row.Count',
                           'Non.Null.Count',
                           'Null.Count']
cleaned_column_quality_summary = pd.DataFrame(zip(cleaned_column_name_list,
                                                  cleaned_data_type_list,
                                                  cleaned_row_count_list,
                                                  cleaned_non_null_count_list,
                                                  cleaned_null_count_list),
                                              columns=cleaned_summary_columns)
display(cleaned_column_quality_summary)

##################################
# Formulating the cleaned dataset
# with categorical columns only
# (selects the 'category' dtype, matching
# the other categorical subsets in this file;
# 'object' would pick up COUNTRY instead)
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='category')

##################################
# Formulating the cleaned dataset
# with numeric columns only
##################################
cancer_rate_cleaned_numeric = cancer_rate_cleaned.select_dtypes(include='number')

##################################
# Taking a snapshot of the cleaned dataset
##################################
cancer_rate_cleaned_numeric.head()

##################################
# Creating the linear regression estimator
# that the imputer fits at each
# round-robin imputation step
##################################
lr = LinearRegression()

##################################
# Configuring the iterative imputer:
# every column with missing values
# is modelled from the other columns,
# for at most 10 rounds
##################################
iterative_imputer_settings = {
    'estimator': lr,
    'max_iter': 10,
    'tol': 1e-10,
    'imputation_order': 'ascending',
    'random_state': 88888888,
}
iterative_imputer = IterativeImputer(**iterative_imputer_settings)

##################################
# Fitting the imputer on the cleaned
# numeric columns and filling in
# their missing values
##################################
cancer_rate_imputed_numeric_array = iterative_imputer.fit_transform(cancer_rate_cleaned_numeric)

##################################
# Wrapping the imputed array
# in a dataframe that reuses
# the numeric column names
##################################
imputed_numeric_column_names = cancer_rate_cleaned_numeric.columns
cancer_rate_imputed_numeric = pd.DataFrame(cancer_rate_imputed_numeric_array,
                                           columns=imputed_numeric_column_names)

##################################
# Previewing the first five rows of the imputed dataset
##################################
cancer_rate_imputed_numeric.head(n=5)

##################################
# Extracting the categorical columns
# of the cleaned dataset
##################################
cancer_rate_cleaned_categorical = cancer_rate_cleaned.select_dtypes(include='category')

##################################
# Filling the missing HDICAT values
# with the most frequent category,
# then renumbering the rows from zero
##################################
hdicat_most_frequent_level = cancer_rate_cleaned_categorical['HDICAT'].mode()[0]
cancer_rate_cleaned_categorical['HDICAT'] = cancer_rate_cleaned_categorical['HDICAT'].fillna(hdicat_most_frequent_level)
cancer_rate_imputed_categorical = cancer_rate_cleaned_categorical.reset_index(drop=True)

##################################
# Placing the imputed numeric and
# categorical columns side by side,
# keeping only the row labels
# present in both
##################################
imputed_column_blocks = [cancer_rate_imputed_numeric, cancer_rate_imputed_categorical]
cancer_rate_imputed = pd.concat(imputed_column_blocks, axis=1, join='inner')

##################################
# Gathering the data types for each column
##################################
data_type_list = list(cancer_rate_imputed.dtypes)

##################################
# Gathering the variable names for each column
##################################
variable_name_list = list(cancer_rate_imputed.columns)

##################################
# Gathering the number of observations for each column
##################################
row_count_list = [len(cancer_rate_imputed)] * len(cancer_rate_imputed.columns)

##################################
# Gathering the number of missing data for each column
##################################
null_count_list = list(cancer_rate_imputed.isna().sum(axis=0))

##################################
# Gathering the number of non-missing data for each column
##################################
non_null_count_list = list(cancer_rate_imputed.count())

##################################
# Gathering the fill rate
# (non-missing count / row count)
# for each column; stored as a list
# rather than a one-shot map iterator
# so the summary below can be rebuilt
##################################
fill_rate_list = list(map(truediv, non_null_count_list, row_count_list))

##################################
# Formulating the summary
# for all imputed columns
##################################
imputed_column_quality_summary = pd.DataFrame(zip(variable_name_list,
                                                  data_type_list,
                                                  row_count_list,
                                                  non_null_count_list,
                                                  null_count_list,
                                                  fill_rate_list), 
                                        columns=['Column.Name',
                                                 'Column.Type',
                                                 'Row.Count',
                                                 'Non.Null.Count',
                                                 'Null.Count',                                                 
                                                 'Fill.Rate'])
display(imputed_column_quality_summary)

##################################
# Formulating the imputed dataset
# with numeric columns only
##################################
cancer_rate_imputed_numeric = cancer_rate_imputed.select_dtypes(include='number')

##################################
# Gathering the variable names for each numeric column
##################################
numeric_variable_name_list = list(cancer_rate_imputed_numeric.columns)

##################################
# Gathering the skewness value for each numeric column
##################################
numeric_skewness_list = cancer_rate_imputed_numeric.skew()

##################################
# Computing the interquartile range
# for all columns
##################################
cancer_rate_imputed_numeric_q1 = cancer_rate_imputed_numeric.quantile(0.25)
cancer_rate_imputed_numeric_q3 = cancer_rate_imputed_numeric.quantile(0.75)
cancer_rate_imputed_numeric_iqr = cancer_rate_imputed_numeric_q3 - cancer_rate_imputed_numeric_q1

##################################
# Gathering the outlier count for each numeric column
# based on the interquartile range criterion
# (values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR)
##################################
numeric_outlier_count_list = ((cancer_rate_imputed_numeric < (cancer_rate_imputed_numeric_q1 - 1.5 * cancer_rate_imputed_numeric_iqr)) | (cancer_rate_imputed_numeric > (cancer_rate_imputed_numeric_q3 + 1.5 * cancer_rate_imputed_numeric_iqr))).sum()

##################################
# Gathering the number of observations for each column
##################################
numeric_row_count_list = [len(cancer_rate_imputed_numeric)] * len(cancer_rate_imputed_numeric.columns)

##################################
# Gathering the outlier ratio
# (outlier count / row count)
# for each numeric column; stored as a list
# rather than a one-shot map iterator
# so the summary below can be rebuilt
##################################
numeric_outlier_ratio_list = list(map(truediv, numeric_outlier_count_list, numeric_row_count_list))

##################################
# Formulating the outlier summary
# for all numeric columns
##################################
numeric_column_outlier_summary = pd.DataFrame(zip(numeric_variable_name_list,
                                                  numeric_skewness_list,
                                                  numeric_outlier_count_list,
                                                  numeric_row_count_list,
                                                  numeric_outlier_ratio_list), 
                                        columns=['Numeric.Column.Name',
                                                 'Skewness',
                                                 'Outlier.Count',
                                                 'Row.Count',
                                                 'Outlier.Ratio'])
display(numeric_column_outlier_summary)

##################################
# Drawing one wide, short boxplot
# per numeric column, each on
# its own figure
##################################
for column in cancer_rate_imputed_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=cancer_rate_imputed_numeric)

##################################
# Formulating a function 
# to plot the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
def plot_correlation_matrix(corr, mask=None):
    """Draw an annotated heatmap of ``corr`` on a new 11x9 figure.

    Parameters
    ----------
    corr
        Matrix handed to ``sns.heatmap`` as the data to draw.
    mask
        Optional mask forwarded unchanged to ``sns.heatmap``.

    The colour scale is pinned to [-1, 1] and centred on 0, with a
    horizontal colour bar. Nothing is returned.
    """
    heatmap_options = {
        'annot': True,
        'vmin': -1,
        'vmax': 1,
        'center': 0,
        'cmap': 'coolwarm',
        'linewidths': 1,
        'linecolor': 'gray',
        'cbar_kws': {'orientation': 'horizontal'},
    }
    _, heatmap_axis = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr, ax=heatmap_axis, mask=mask, **heatmap_options)

##################################
# Running a Pearson correlation test
# (coefficient and p-value)
# for every unordered pair
# of numeric columns
##################################
cancer_rate_imputed_numeric_columns = cancer_rate_imputed_numeric.columns.tolist()
cancer_rate_imputed_numeric_correlation_pairs = {}
for numeric_column_a, numeric_column_b in itertools.combinations(cancer_rate_imputed_numeric_columns, 2):
    cancer_rate_imputed_numeric_correlation_pairs[f'{numeric_column_a}_{numeric_column_b}'] = stats.pearsonr(
        cancer_rate_imputed_numeric[numeric_column_a],
        cancer_rate_imputed_numeric[numeric_column_b])

##################################
# Tabulating the pairwise results
# (one row per column pair) and showing
# the 20 largest coefficients
##################################
cancer_rate_imputed_numeric_summary = pd.DataFrame.from_dict(cancer_rate_imputed_numeric_correlation_pairs,
                                                             orient='index')
cancer_rate_imputed_numeric_summary.columns = ['Pearson.Correlation.Coefficient', 'Correlation.PValue']
display(cancer_rate_imputed_numeric_summary
        .sort_values(by='Pearson.Correlation.Coefficient', ascending=False)
        .head(20))

##################################
# Plotting the correlation matrix
# for all pairwise combinations
# of numeric columns
##################################
cancer_rate_imputed_numeric_correlation = cancer_rate_imputed_numeric.corr()
# Boolean mask hiding the upper triangle and the diagonal. It is built
# from the matrix shape rather than the coefficient values, so a cell
# whose coefficient happens to be exactly 0 is hidden as well.
mask = np.triu(np.ones_like(cancer_rate_imputed_numeric_correlation, dtype=bool))
plot_correlation_matrix(cancer_rate_imputed_numeric_correlation, mask)
plt.show()

##################################
# Formulating a function 
# to compute the correlation p-values
# for all pairwise combinations
# of numeric columns
##################################
def correlation_significance(df=None):
    """Return the matrix of Pearson correlation p-values for the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are tested pairwise with ``stats.pearsonr``.
        Required in practice; the ``None`` default is kept only so the
        signature stays unchanged.

    Returns
    -------
    numpy.ndarray
        Square array with one row and one column per column of ``df``.
        Entry ``[i, j]`` holds the p-value for columns ``i`` and ``j``;
        the diagonal is left at 0.

    Raises
    ------
    ValueError
        If ``df`` is ``None``.
    """
    if df is None:
        raise ValueError("correlation_significance requires a DataFrame, got None")

    n_columns = df.shape[1]
    p_matrix = np.zeros(shape=(n_columns, n_columns))
    # Columns are addressed by position, so duplicated labels are handled
    # and no label-to-index lookup is needed inside the loop. Each pair is
    # tested once and mirrored across the diagonal.
    for row_position in range(n_columns):
        for col_position in range(row_position + 1, n_columns):
            _, p_value = stats.pearsonr(df.iloc[:, row_position], df.iloc[:, col_position])
            p_matrix[row_position, col_position] = p_value
            p_matrix[col_position, row_position] = p_value
    return p_matrix

##################################
# Re-plotting the correlation matrix
# showing only lower-triangle cells
# whose p-value is below 0.05
##################################
cancer_rate_imputed_numeric_correlation_p_values = correlation_significance(cancer_rate_imputed_numeric)
# Cells outside the significant lower triangle are masked out
mask = ~np.tril(cancer_rate_imputed_numeric_correlation_p_values < 0.05)
plot_correlation_matrix(cancer_rate_imputed_numeric_correlation, mask)

##################################
# Removing GDPPER and METEMI in place,
# one column from each highly
# correlated variable pair
##################################
cancer_rate_imputed_numeric.drop(columns=['GDPPER', 'METEMI'], inplace=True)

##################################
# Reporting the shape of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_imputed_numeric.shape)

Dataset Dimensions:

(163, 14)

##################################
# Applying an unstandardized
# Yeo-Johnson power transformation
# to every numeric column
##################################
yeo_johnson_transformer = PowerTransformer(standardize=False, method='yeo-johnson')
cancer_rate_imputed_numeric_array = yeo_johnson_transformer.fit_transform(cancer_rate_imputed_numeric)

##################################
# Wrapping the transformed array
# back into a dataframe that carries
# the original column names
##################################
cancer_rate_transformed_numeric = pd.DataFrame(
    data=cancer_rate_imputed_numeric_array,
    columns=cancer_rate_imputed_numeric.columns,
)

##################################
# Drawing one wide, short boxplot figure
# per transformed numeric column
##################################
for column in cancer_rate_transformed_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=cancer_rate_transformed_numeric)

##################################
# Removing PM2EXP in place, the column
# which remained skewed even after
# the shape transformation
##################################
cancer_rate_transformed_numeric.drop(columns=['PM2EXP'], inplace=True)

##################################
# Reporting the shape of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_transformed_numeric.shape)

Dataset Dimensions:

(163, 13)

##################################
# Standardizing the transformed columns
# with a StandardScaler so they share
# a comparable scale
##################################
standardization_scaler = StandardScaler()
cancer_rate_transformed_numeric_array = standardization_scaler.fit_transform(cancer_rate_transformed_numeric)

##################################
# Wrapping the scaled array
# back into a dataframe that carries
# the transformed column names
##################################
cancer_rate_scaled_numeric = pd.DataFrame(
    data=cancer_rate_transformed_numeric_array,
    columns=cancer_rate_transformed_numeric.columns,
)

##################################
# Drawing one wide, short boxplot figure
# per scaled numeric column
##################################
for column in cancer_rate_scaled_numeric.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=column, data=cancer_rate_scaled_numeric)

##################################
# Copying the HDICAT values into
# a fresh single-column dataframe
# (default integer index)
##################################
cancer_rate_categorical_encoded = pd.DataFrame(
    {'HDICAT': cancer_rate_cleaned_categorical['HDICAT'].to_list()}
)

##################################
# One-hot encoding the HDICAT column
##################################
cancer_rate_categorical_encoded = pd.get_dummies(cancer_rate_categorical_encoded, columns=['HDICAT'])

##################################
# Joining the scaled numeric columns
# and the encoded categorical columns
# side by side on their shared index
##################################
cancer_rate_preprocessed = pd.concat(
    [cancer_rate_scaled_numeric, cancer_rate_categorical_encoded],
    join='inner',
    axis=1,
)

##################################
# Reporting the shape of the consolidated dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_preprocessed.shape)

Dataset Dimensions:

(163, 17)

##################################
# Listing the target column and
# every remaining column as predictors
##################################
cancer_rate_preprocessed_target = ['CANRAT']
cancer_rate_preprocessed_predictors = cancer_rate_preprocessed.columns.drop('CANRAT')

##################################
# Naming the plotting variables:
# a single target name and
# the predictor column index
##################################
y_variable = cancer_rate_preprocessed_target[0]
x_variables = cancer_rate_preprocessed_predictors

##################################
# Deriving the number of 
# rows and columns for the subplots
# from the number of predictors
##################################
num_cols = 2
# Ceiling division gives every predictor its own axis even when the
# predictor list changes size; at least one row is always requested
num_rows = max(1, -(-len(x_variables) // num_cols))

##################################
# Formulating the subplot structure
# (5 inches of height per row)
##################################
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)

##################################
# Flattening the multi-row and
# multi-column axes
##################################
axes = axes.ravel()

##################################
# Formulating the individual scatterplots
# of the target against each predictor
##################################
for i, x_variable in enumerate(x_variables):
    ax = axes[i]
    ax.scatter(cancer_rate_preprocessed[x_variable],cancer_rate_preprocessed[y_variable])
    ax.set_title(f'{y_variable} Versus {x_variable}')
    ax.set_xlabel(x_variable)
    ax.set_ylabel(y_variable)

##################################
# Hiding any trailing axes
# that received no predictor
##################################
for unused_axis in axes[len(x_variables):]:
    unused_axis.set_visible(False)

##################################
# Adjusting the subplot layout
##################################
plt.tight_layout()

##################################
# Presenting the subplots
##################################
plt.show()

##################################
# Running a Pearson correlation test
# between CANRAT and every numeric column.
# CANRAT itself is still in the column
# list, so a CANRAT_CANRAT entry is produced
##################################
cancer_rate_preprocessed_numeric = cancer_rate_preprocessed.drop(
    columns=['HDICAT_L','HDICAT_M','HDICAT_H','HDICAT_VH'])
cancer_rate_preprocessed_numeric_columns = cancer_rate_preprocessed_numeric.columns.tolist()
cancer_rate_preprocessed_numeric_correlation_target = {}
for numeric_column in cancer_rate_preprocessed_numeric_columns:
    cancer_rate_preprocessed_numeric_correlation_target[f'CANRAT_{numeric_column}'] = stats.pearsonr(
        cancer_rate_preprocessed_numeric['CANRAT'],
        cancer_rate_preprocessed_numeric[numeric_column])

##################################
# Tabulating the results
# (one row per tested column) and
# showing the 13 smallest p-values
##################################
cancer_rate_preprocessed_numeric_summary = pd.DataFrame.from_dict(
    cancer_rate_preprocessed_numeric_correlation_target, orient='index')
cancer_rate_preprocessed_numeric_summary.columns = ['Pearson.Correlation.Coefficient', 'Correlation.PValue']
display(cancer_rate_preprocessed_numeric_summary
        .sort_values(by='Correlation.PValue', ascending=True)
        .head(13))

##################################
# Running an equal-variance
# two-sample t-test on CANRAT
# for each HDICAT indicator column
# (rows flagged 0 versus rows flagged 1)
##################################
cancer_rate_preprocessed_categorical_columns = ['HDICAT_L','HDICAT_M','HDICAT_H','HDICAT_VH']
cancer_rate_preprocessed_categorical = cancer_rate_preprocessed[['CANRAT','HDICAT_L','HDICAT_M','HDICAT_H','HDICAT_VH']]
cancer_rate_preprocessed_categorical_ttest_target = {}
for categorical_column in cancer_rate_preprocessed_categorical_columns:
    group_0 = cancer_rate_preprocessed_categorical.loc[
        cancer_rate_preprocessed_categorical[categorical_column] == 0]
    group_1 = cancer_rate_preprocessed_categorical.loc[
        cancer_rate_preprocessed_categorical[categorical_column] == 1]
    cancer_rate_preprocessed_categorical_ttest_target[f'CANRAT_{categorical_column}'] = stats.ttest_ind(
        group_0['CANRAT'],
        group_1['CANRAT'],
        equal_var=True)

##################################
# Tabulating the t-test results
# (one row per indicator column) and
# showing the 4 smallest p-values
##################################
cancer_rate_preprocessed_categorical_summary = pd.DataFrame.from_dict(
    cancer_rate_preprocessed_categorical_ttest_target, orient='index')
cancer_rate_preprocessed_categorical_summary.columns = ['T.Test.Statistic', 'T.Test.PValue']
display(cancer_rate_preprocessed_categorical_summary
        .sort_values(by='T.Test.PValue', ascending=True)
        .head(4))

##################################
# Building the pre-modelling dataset
# by dropping the columns excluded
# after hypothesis testing
##################################
cancer_rate_premodelling = cancer_rate_preprocessed.drop(
    columns=['AGRLND','POPDEN','GHGEMI','FORARE','POPGRO','URBPOP','HDICAT_H','HDICAT_M','HDICAT_L'])

##################################
# Reporting the shape of the filtered dataset
##################################
print('Dataset Dimensions: ')
display(cancer_rate_premodelling.shape)

Dataset Dimensions:

(163, 8)

##################################
# Listing the column names and data types
##################################
print('Column Names and Data Types:')
display(cancer_rate_premodelling.dtypes)

Column Names and Data Types:

CANRAT       float64
LIFEXP       float64
TUBINC       float64
DTHCMD       float64
CO2EMI       float64
GDPCAP       float64
EPISCO       float64
HDICAT_VH       bool
dtype: object

##################################
# Taking a snapshot of the dataset
##################################
cancer_rate_premodelling.head()

##################################
# Drawing the regression pairplot
# across all pre-modelling columns
##################################
sns.pairplot(cancer_rate_premodelling, kind='reg')
plt.show()

##################################
# Splitting the columns into
# predictors (X) and target (y)
##################################
X = cancer_rate_premodelling.drop(columns='CANRAT')
y = cancer_rate_premodelling['CANRAT']

##################################
# Holding out 30% of the rows
# as the test set, with a fixed
# seed for reproducibility
##################################
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=88888888, test_size=0.3)

##################################
# Reporting the shape of the train predictors
##################################
print('Dataset Dimensions: ')
display(X_train.shape)

Dataset Dimensions:

(114, 7)

##################################
# Performing a general exploration of the test dataset
##################################
print('Dataset Dimensions: ')
display(X_test.shape)

Dataset Dimensions:

(49, 7)

##################################
# Defining a function to compute
# model performance
##################################
def model_performance_evaluation(y_true, y_pred):
    """Summarise regression performance as a two-column dataframe.

    Parameters
    ----------
    y_true
        Observed target values.
    y_pred
        Predicted target values.

    Returns
    -------
    pandas.DataFrame
        Three rows (R2, MSE, MAE, in that order) with the columns
        ``metric_name`` and ``metric_value``.
    """
    scorers = (('R2', r2_score),
               ('MSE', mean_squared_error),
               ('MAE', mean_absolute_error))
    metric_rows = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]
    return pd.DataFrame(metric_rows, columns=['metric_name', 'metric_value'])

##################################
# Defining a function to investigate
# model performance with respect to
# the regularization parameter
##################################
def rmse_alpha_plot(model_type):
    """Plot test-set mean squared error against the alpha hyperparameter.

    For every value in the module-level ``alphas`` list, an estimator is
    built as ``model_type(alpha=alpha)``, fitted on ``X_train``/``y_train``
    and scored with ``mean_squared_error`` on ``X_test``/``y_test``.
    Despite the function name, the plotted quantity is the MSE; no square
    root is taken.

    Parameters
    ----------
    model_type : callable
        Estimator factory accepting an ``alpha`` keyword; the result must
        provide ``fit`` and ``predict``.
    """
    mse_per_alpha = []
    for alpha in alphas:
        model = model_type(alpha=alpha)
        model.fit(X_train, y_train)
        mse_per_alpha.append(mean_squared_error(y_test, model.predict(X_test)))

    ax = plt.gca()
    ax.plot(alphas, mse_per_alpha)
    # Log scale on x: the visible alpha grids in this file span several orders of magnitude
    ax.set_xscale("log")
    plt.xlabel("Alpha")
    plt.ylabel("Mean Squared Error")
    plt.title("Mean Squared Error versus Alpha Regularization")
    plt.show()
    
    
def rmse_l1_ratio_plot(model_type):
    """Plot test-set mean squared error against the l1_ratio hyperparameter.

    For every value in the module-level ``l1_ratios`` list, an estimator is
    built as ``model_type(l1_ratio=l1_ratio)``, fitted on
    ``X_train``/``y_train`` and scored with ``mean_squared_error`` on
    ``X_test``/``y_test``. Despite the function name, the plotted quantity
    is the MSE; no square root is taken.

    Parameters
    ----------
    model_type : callable
        Estimator factory accepting an ``l1_ratio`` keyword; the result
        must provide ``fit`` and ``predict``.
    """
    mse_per_l1_ratio = []
    for l1_ratio in l1_ratios:
        model = model_type(l1_ratio=l1_ratio)
        model.fit(X_train, y_train)
        mse_per_l1_ratio.append(mean_squared_error(y_test, model.predict(X_test)))

    ax = plt.gca()
    ax.plot(l1_ratios, mse_per_l1_ratio)
    plt.xlabel("L1_Ratio")
    plt.ylabel("Mean Squared Error")
    plt.title("Mean Squared Error versus L1_Ratio Regularization")
    plt.show()

##################################
# Assembling the linear regression pipeline:
# degree-1 feature expansion without a bias
# column, followed by ordinary least squares
##################################
linear_regression_pipeline = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('linear_regression', LinearRegression()),
])

##################################
# Fitting the pipeline on the train set
##################################
linear_regression_pipeline.fit(X_train, y_train)

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('linear_regression', LinearRegression())])

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('linear_regression', LinearRegression())])

PolynomialFeatures(degree=1, include_bias=False)

LinearRegression()

##################################
# Generating linear regression
# predictions for the train set
##################################
linear_y_hat_train = linear_regression_pipeline.predict(X_train)

##################################
# Summarizing the train set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
linear_performance_train = model_performance_evaluation(y_train, linear_y_hat_train)
linear_performance_train['model'] = 'linear_regression'
linear_performance_train['set'] = 'train'
print('Linear Regression Model Performance on Train Data: ')
display(linear_performance_train)

Linear Regression Model Performance on Train Data:

##################################
# Generating linear regression
# predictions for the test set
##################################
linear_y_hat_test = linear_regression_pipeline.predict(X_test)

##################################
# Summarizing the test set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
linear_performance_test = model_performance_evaluation(y_test, linear_y_hat_test)
linear_performance_test['model'] = 'linear_regression'
linear_performance_test['set'] = 'test'
print('Linear Regression Model Performance on Test Data: ')
display(linear_performance_test)

Linear Regression Model Performance on Test Data:

##################################
# Scattering the predicted against
# the actual test set target values
# for the linear regression model
##################################
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(111)
axes.grid(True)
axes.plot(y_test,
          linear_y_hat_test,
          linestyle='',
          marker='o',
          markersize=3.0)
# Both axes share the same fixed limits
lim = (-2, 2)
axes.set(title='Linear Regression Model Prediction Performance',
         xlabel='Actual Cancer Rate',
         ylabel='Predicted Cancer Rate',
         xlim=lim,
         ylim=lim);

##################################
# Assembling the polynomial regression pipeline:
# degree-2 feature expansion without a bias
# column, followed by ordinary least squares
##################################
polynomial_regression_pipeline = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('polynomial_regression', LinearRegression()),
])

##################################
# Fitting the pipeline on the train set
##################################
polynomial_regression_pipeline.fit(X_train, y_train)

Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                ('polynomial_regression', LinearRegression())])

Pipeline(steps=[('polynomial_features', PolynomialFeatures(include_bias=False)),
                ('polynomial_regression', LinearRegression())])

PolynomialFeatures(include_bias=False)

LinearRegression()

##################################
# Generating polynomial regression
# predictions for the train set
##################################
polynomial_y_hat_train = polynomial_regression_pipeline.predict(X_train)

##################################
# Summarizing the train set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
polynomial_performance_train = model_performance_evaluation(y_train, polynomial_y_hat_train)
polynomial_performance_train['model'] = 'polynomial_regression'
polynomial_performance_train['set'] = 'train'
print('Polynomial Regression Model Performance on Train Data: ')
display(polynomial_performance_train)

Polynomial Regression Model Performance on Train Data:

##################################
# Generating polynomial regression
# predictions for the test set
##################################
polynomial_y_hat_test = polynomial_regression_pipeline.predict(X_test)

##################################
# Summarizing the test set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
polynomial_performance_test = model_performance_evaluation(y_test, polynomial_y_hat_test)
polynomial_performance_test['model'] = 'polynomial_regression'
polynomial_performance_test['set'] = 'test'
print('Polynomial Regression Model Performance on Test Data: ')
display(polynomial_performance_test)

Polynomial Regression Model Performance on Test Data:

##################################
# Scattering the predicted against
# the actual test set target values
# for the polynomial regression model
##################################
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(111)
axes.grid(True)
axes.plot(y_test,
          polynomial_y_hat_test,
          linestyle='',
          marker='o',
          markersize=3.0)
# Both axes share the same fixed limits
lim = (-2, 2)
axes.set(title='Polynomial Regression Model Prediction Performance',
         xlabel='Actual Cancer Rate',
         ylabel='Predicted Cancer Rate',
         xlim=lim,
         ylim=lim);

##################################
# Listing the candidate alpha values
# for the ridge regression model
##################################
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

##################################
# Rendering each alpha as a string,
# in the same order as the alpha list
##################################
alphas_string = [str(alpha) for alpha in alphas]

##################################
# Defining a pipeline for the 
# ridge regression model
##################################
# store_cv_results replaces store_cv_values, which scikit-learn
# deprecated in version 1.5 and announced for removal in 1.7
# (requires scikit-learn >= 1.5)
ridge_regression_pipeline = Pipeline([('polynomial_features', PolynomialFeatures(include_bias=False, degree=1)), 
                                      ('ridge_regression', RidgeCV(alphas=alphas, cv=None, store_cv_results=True))])

##################################
# Fitting a ridge regression model
##################################
ridge_regression_pipeline.fit(X_train, y_train)

D:\Github_Codes\ProjectPortfolio\Portfolio_Project_40\mlexplore_venv\Lib\site-packages\sklearn\linear_model\_ridge.py:2341: FutureWarning: 'store_cv_values' is deprecated in version 1.5 and will be removed in 1.7. Use 'store_cv_results' instead.
  warnings.warn(

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge_regression',
                 RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         store_cv_values=True))])

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge_regression',
                 RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         store_cv_values=True))])

PolynomialFeatures(degree=1, include_bias=False)

RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        store_cv_values=True)

##################################
# Determining the optimal alpha
##################################
ridge_regression_pipeline['ridge_regression'].alpha_

np.float64(10.0)

##################################
# Consolidating the LOOCV results
# into long format: one row per
# (case_index, alpha) combination
##################################
# cv_results_ replaces the cv_values_ attribute, which scikit-learn
# deprecated in version 1.5 and announced for removal in 1.7
# (requires scikit-learn >= 1.5)
ridge_regression_LOOCV = pd.DataFrame(ridge_regression_pipeline['ridge_regression'].cv_results_,
                                      columns=alphas_string)
ridge_regression_LOOCV.index.name = 'case_index'
ridge_regression_LOOCV.reset_index(inplace=True)
ridge_regression_LOOCV = pd.melt(ridge_regression_LOOCV, 
                                 id_vars = ['case_index'], 
                                 value_vars = alphas_string, 
                                 ignore_index = False)
ridge_regression_LOOCV.rename(columns = {'variable':'alpha', 'value':'MSE'}, inplace = True)
display(ridge_regression_LOOCV)

D:\Github_Codes\ProjectPortfolio\Portfolio_Project_40\mlexplore_venv\Lib\site-packages\sklearn\utils\deprecation.py:102: FutureWarning: Attribute `cv_values_` is deprecated in version 1.5 and will be removed in 1.7. Use `cv_results_` instead.
  warnings.warn(msg, category=FutureWarning)

##################################
# Drawing one boxplot of the LOOCV
# errors per ridge alpha value,
# with the mean marked on each box
##################################
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
plt.grid(True)
sns.boxplot(data=ridge_regression_LOOCV,
            x='alpha',
            y='MSE',
            color="#0070C0",
            showmeans=True,
            meanprops=dict(marker='o',
                           markerfacecolor='#ADD8E6',
                           markeredgecolor='#FF0000',
                           markersize='8'))
plt.title('Ridge Regression Hyperparameter Tuning Using LOOCV')
plt.xlabel('Alpha Hyperparameter')
plt.ylabel('Mean Squared Error')
plt.show()

##################################
# Generating ridge regression
# predictions for the train set
##################################
ridge_y_hat_train = ridge_regression_pipeline.predict(X_train)

##################################
# Summarizing the train set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
ridge_performance_train = model_performance_evaluation(y_train, ridge_y_hat_train)
ridge_performance_train['model'] = 'ridge_regression'
ridge_performance_train['set'] = 'train'
print('Ridge Regression Model Performance on Train Data: ')
display(ridge_performance_train)

Ridge Regression Model Performance on Train Data:

##################################
# Generating ridge regression
# predictions for the test set
##################################
ridge_y_hat_test = ridge_regression_pipeline.predict(X_test)

##################################
# Summarizing the test set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
ridge_performance_test = model_performance_evaluation(y_test, ridge_y_hat_test)
ridge_performance_test['model'] = 'ridge_regression'
ridge_performance_test['set'] = 'test'
print('Ridge Regression Model Performance on Test Data: ')
display(ridge_performance_test)

Ridge Regression Model Performance on Test Data:

##################################
# Scattering the predicted against
# the actual test set target values
# for the ridge regression model
##################################
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(111)
axes.grid(True)
axes.plot(y_test,
          ridge_y_hat_test,
          linestyle='',
          marker='o',
          markersize=3.0)
# Both axes share the same fixed limits
lim = (-2, 2)
axes.set(title='Ridge Regression Model Prediction Performance',
         xlabel='Actual Cancer Rate',
         ylabel='Predicted Cancer Rate',
         xlim=lim,
         ylim=lim);

##################################
# Listing the candidate alpha values
# for the lasso regression model
##################################
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

##################################
# Rendering each alpha as a string,
# in the same order as the alpha list
##################################
alphas_string = [str(alpha) for alpha in alphas]

##################################
# Assembling the lasso regression pipeline:
# degree-1 feature expansion without a bias
# column, followed by a lasso whose alpha is
# chosen by leave-one-out cross-validation
##################################
lasso_regression_pipeline = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('lasso_regression', LassoCV(cv=LeaveOneOut(), alphas=alphas)),
])

##################################
# Fitting the pipeline on the train set
##################################
lasso_regression_pipeline.fit(X_train, y_train)

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('lasso_regression',
                 LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         cv=LeaveOneOut()))])

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('lasso_regression',
                 LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         cv=LeaveOneOut()))])

PolynomialFeatures(degree=1, include_bias=False)

LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], cv=LeaveOneOut())

##################################
# Determining the optimal alpha
##################################
lasso_regression_pipeline['lasso_regression'].alpha_

np.float64(0.01)

##################################
# Consolidating the LOOCV results
# with one row per held-out case
# and one column per alpha
##################################
lasso_regression_LOOCV = pd.DataFrame(lasso_regression_pipeline['lasso_regression'].mse_path_.transpose())
# Reversing the column order without hard-coding the number of alphas.
# NOTE(review): presumably mse_path_ is ordered from the largest alpha to
# the smallest, so the reversal lines the columns up with the ascending
# `alphas` list used for the labels below - confirm against LassoCV.alphas_
lasso_regression_LOOCV = lasso_regression_LOOCV.iloc[:, ::-1]
lasso_regression_LOOCV = lasso_regression_LOOCV.set_axis([str(alpha) for alpha in alphas], axis=1)
lasso_regression_LOOCV.index.name = 'case_index'
lasso_regression_LOOCV.reset_index(inplace=True)
display(lasso_regression_LOOCV)

##################################
# Reshaping the LOOCV results
# into long format: one row per
# (case_index, alpha) combination
##################################
lasso_regression_LOOCV = (
    lasso_regression_LOOCV
    .melt(id_vars=['case_index'], value_vars=alphas_string, ignore_index=False)
    .rename(columns={'variable': 'alpha', 'value': 'MSE'})
)
display(lasso_regression_LOOCV)

##################################
# Drawing one boxplot of the LOOCV
# errors per lasso alpha value,
# with the mean marked on each box
##################################
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
plt.grid(True)
sns.boxplot(data=lasso_regression_LOOCV,
            x='alpha',
            y='MSE',
            color='#0070C0',
            showmeans=True,
            meanprops=dict(marker='o',
                           markerfacecolor='#ADD8E6',
                           markeredgecolor='#FF0000',
                           markersize='8'))
plt.title('Lasso Regression Hyperparameter Tuning Using LOOCV')
plt.xlabel('Alpha Hyperparameter')
plt.ylabel('Mean Squared Error')
plt.show()

##################################
# Generating lasso regression
# predictions for the train set
##################################
lasso_y_hat_train = lasso_regression_pipeline.predict(X_train)

##################################
# Summarizing the train set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
lasso_performance_train = model_performance_evaluation(y_train, lasso_y_hat_train)
lasso_performance_train['model'] = 'lasso_regression'
lasso_performance_train['set'] = 'train'
print('Lasso Regression Model Performance on Train Data: ')
display(lasso_performance_train)

Lasso Regression Model Performance on Train Data:

##################################
# Generating lasso regression
# predictions for the test set
##################################
lasso_y_hat_test = lasso_regression_pipeline.predict(X_test)

##################################
# Summarizing the test set metrics;
# the scalar labels are broadcast
# to every metric row
##################################
lasso_performance_test = model_performance_evaluation(y_test, lasso_y_hat_test)
lasso_performance_test['model'] = 'lasso_regression'
lasso_performance_test['set'] = 'test'
print('Lasso Regression Model Performance on Test Data: ')
display(lasso_performance_test)

Lasso Regression Model Performance on Test Data:

##################################
# Scattering the predicted against
# the actual test set target values
# for the lasso regression model
##################################
figure = plt.figure(figsize=(10, 6))
axes = figure.add_subplot(111)
axes.grid(True)
axes.plot(y_test,
          lasso_y_hat_test,
          linestyle='',
          marker='o',
          markersize=3.0)
# Both axes share the same fixed limits
lim = (-2, 2)
axes.set(title='Lasso Regression Model Prediction Performance',
         xlabel='Actual Cancer Rate',
         ylabel='Predicted Cancer Rate',
         xlim=lim,
         ylim=lim);

##################################
# Defining the hyperparameters
# for the elastic-net regression model
##################################
l1_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
alphas = [0.0001,0.001,0.01,0.1,1,10,100,1000]

##################################
# Formulating a string equivalent 
# of the l1_ratio hyperparameter list,
# in original and reversed order
##################################
# Real lists are kept (instead of a map object that is exhausted once
# the reversed copy has been built) so both names stay reusable
l1_ratios_string = [str(l1_ratio) for l1_ratio in l1_ratios]
l1_ratios_string_reversed = l1_ratios_string[::-1]

##################################
# Formulating a string equivalent 
# of the alpha hyperparameter list,
# in original and reversed order
##################################
alphas_string = [str(alpha) for alpha in alphas]
alphas_string_reversed = alphas_string[::-1]

##################################
# Assembling the elastic-net pipeline:
# degree-1 polynomial features followed
# by an ElasticNetCV estimator tuned
# over the alpha and l1_ratio grids
# with leave-one-out cross-validation
##################################
elasticnet_regression_steps = [
    ('polynomial_features', PolynomialFeatures(degree=1, include_bias=False)),
    ('elasticnet_regression', ElasticNetCV(l1_ratio=l1_ratios, alphas=alphas, cv=LeaveOneOut())),
]
elasticnet_regression_pipeline = Pipeline(steps=elasticnet_regression_steps)

##################################
# Training the pipeline
# on the train set
##################################
elasticnet_regression_pipeline.fit(X_train, y_train)

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('elasticnet_regression',
                 ElasticNetCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                      1000],
                              cv=LeaveOneOut(),
                              l1_ratio=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                        0.9]))])

Pipeline(steps=[('polynomial_features',
                 PolynomialFeatures(degree=1, include_bias=False)),
                ('elasticnet_regression',
                 ElasticNetCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                      1000],
                              cv=LeaveOneOut(),
                              l1_ratio=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                        0.9]))])

PolynomialFeatures(degree=1, include_bias=False)

ElasticNetCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
             cv=LeaveOneOut(),
             l1_ratio=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

##################################
# Retrieving the alpha value
# chosen by the cross-validated
# elastic-net estimator
##################################
elasticnet_regression_pipeline.named_steps['elasticnet_regression'].alpha_

np.float64(0.01)

##################################
# Retrieving the l1_ratio value
# chosen by the cross-validated
# elastic-net estimator
##################################
elasticnet_regression_pipeline.named_steps['elasticnet_regression'].l1_ratio_

np.float64(0.9)

##################################
# Consolidating the LOOCV results
##################################
# scikit-learn documents mse_path_ as (n_l1_ratio, n_alpha, n_folds) when
# several l1_ratio values are searched; the three sizes are read from the
# array so the reshaping below follows the grid and train set sizes
# instead of hard-coded counts
elasticnet_regression_LOOCV_raw = elasticnet_regression_pipeline['elasticnet_regression'].mse_path_
(elasticnet_regression_LOOCV_l1_ratio_count,
 elasticnet_regression_LOOCV_alpha_count,
 elasticnet_regression_LOOCV_case_count) = elasticnet_regression_LOOCV_raw.shape

# One row per (l1_ratio, alpha) pair and one column per left-out case
elasticnet_regression_LOOCV = pd.DataFrame(elasticnet_regression_LOOCV_raw.reshape(-1, elasticnet_regression_LOOCV_case_count))
# Rows are grouped by l1_ratio, so each 1-based l1_ratio level
# is repeated once per alpha
elasticnet_regression_LOOCV.index = np.repeat(np.arange(elasticnet_regression_LOOCV_l1_ratio_count),
                                              elasticnet_regression_LOOCV_alpha_count) + 1
elasticnet_regression_LOOCV.index.name = 'l1_ratio'
elasticnet_regression_LOOCV.reset_index(inplace=True)

##################################
# Creating a dataframe based on l1_ratio
# from the LOOCV results
##################################
# This is the same object as elasticnet_regression_LOOCV (not a copy)
elasticnet_regression_LOOCV_l1_ratio = elasticnet_regression_LOOCV
display(elasticnet_regression_LOOCV_l1_ratio)

##################################
# Creating a dataframe based on alpha
# from the LOOCV results
##################################
elasticnet_regression_LOOCV_alpha = elasticnet_regression_LOOCV.drop(['l1_ratio'], axis=1)
# Within every l1_ratio group the rows cycle through the 1-based alpha levels
elasticnet_regression_LOOCV_alpha['alpha'] = list(range(1, elasticnet_regression_LOOCV_alpha_count + 1)) * elasticnet_regression_LOOCV_l1_ratio_count
display(elasticnet_regression_LOOCV_alpha)

##################################
# Transforming the l1_ratio dataframe
# detailing the LOOCV results
# from wide to long format
##################################
# The case columns are labelled 0..n_cases-1; the count is taken from the
# last axis of the raw MSE path rather than a hard-coded 114, so the cell
# keeps working when the train set size changes
elasticnet_regression_LOOCV_l1_ratio = pd.melt(elasticnet_regression_LOOCV_l1_ratio, 
                                               id_vars=['l1_ratio'], 
                                               value_vars=list(range(elasticnet_regression_LOOCV_raw.shape[-1])), 
                                               ignore_index=False)
elasticnet_regression_LOOCV_l1_ratio.rename(columns = {'variable':'case_index', 'value':'MSE'}, inplace = True)
display(elasticnet_regression_LOOCV_l1_ratio)

##################################
# Renaming the l1_ratio levels
# based on the proper category labels
##################################
# Level k (1-based) is the k-th entry of the first axis of mse_path_
elasticnet_regression_LOOCV_l1_ratio_conditions = [
    (elasticnet_regression_LOOCV_l1_ratio['l1_ratio'] == l1_ratio_level)
    for l1_ratio_level in range(1, len(l1_ratios) + 1)]

# The first axis of mse_path_ follows l1_ratios in the order they were
# passed to ElasticNetCV (only the alpha grid is re-sorted in descending
# order), so the labels must be in the original order, not reversed;
# the reversed list labelled level 1 (l1_ratio=0.1) as '0.9'
elasticnet_regression_LOOCV_l1_ratio_values = [str(l1_ratio_value) for l1_ratio_value in l1_ratios]
elasticnet_regression_LOOCV_l1_ratio['l1_ratio'] = np.select(elasticnet_regression_LOOCV_l1_ratio_conditions,
                                                             elasticnet_regression_LOOCV_l1_ratio_values,
                                                             default=str(np.nan))
elasticnet_regression_LOOCV_l1_ratio['l1_ratio'] = elasticnet_regression_LOOCV_l1_ratio['l1_ratio'].astype('category')

##################################
# Drawing the distribution of the
# LOOCV squared errors for each
# l1_ratio level of the
# elastic-net regression model
##################################
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
plt.grid(True)
# Styling of the marker that flags the mean of each box
elasticnet_l1_ratio_mean_marker = {'marker':'o',
                                   'markerfacecolor':'#ADD8E6',
                                   'markeredgecolor':'#FF0000',
                                   'markersize':'8'}
sns.boxplot(data=elasticnet_regression_LOOCV_l1_ratio,
            x='l1_ratio',
            y='MSE',
            color='#0070C0',
            showmeans=True,
            meanprops=elasticnet_l1_ratio_mean_marker)
plt.xlabel('L1 Ratio Hyperparameter')
plt.ylabel('Mean Squared Error')
plt.title('Elastic Net Regression Hyperparameter Tuning Using LOOCV')
plt.show()

##################################
# Transforming the alpha dataframe
# detailing the LOOCV results
# from wide to long format
##################################
# The case columns are labelled 0..n_cases-1; the count is taken from the
# last axis of the raw MSE path rather than a hard-coded 114, so the cell
# keeps working when the train set size changes
elasticnet_regression_LOOCV_alpha = pd.melt(elasticnet_regression_LOOCV_alpha, 
                                            id_vars=['alpha'], 
                                            value_vars=list(range(elasticnet_regression_LOOCV_raw.shape[-1])), 
                                            ignore_index=False)
elasticnet_regression_LOOCV_alpha.rename(columns = {'variable':'case_index', 'value':'MSE'}, inplace = True)
display(elasticnet_regression_LOOCV_alpha)

##################################
# Replacing the numeric alpha levels
# with their text category labels
##################################
# One boolean mask per alpha level (1 through 8)
elasticnet_regression_LOOCV_alpha_conditions = [
    (elasticnet_regression_LOOCV_alpha['alpha'] == alpha_level)
    for alpha_level in range(1, 9)]

# Level k takes the k-th entry of the reversed alpha label list
elasticnet_regression_LOOCV_alpha_values = alphas_string_reversed
elasticnet_regression_LOOCV_alpha['alpha'] = np.select(
    elasticnet_regression_LOOCV_alpha_conditions,
    elasticnet_regression_LOOCV_alpha_values,
    default=str(np.nan))
elasticnet_regression_LOOCV_alpha['alpha'] = elasticnet_regression_LOOCV_alpha['alpha'].astype('category')

##################################
# Drawing the distribution of the
# LOOCV squared errors for each
# alpha level of the
# elastic-net regression model
##################################
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
plt.grid(True)
# Styling of the marker that flags the mean of each box
elasticnet_alpha_mean_marker = {'marker':'o',
                                'markerfacecolor':'#ADD8E6',
                                'markeredgecolor':'#FF0000',
                                'markersize':'8'}
sns.boxplot(data=elasticnet_regression_LOOCV_alpha,
            x='alpha',
            y='MSE',
            color='#0070C0',
            showmeans=True,
            meanprops=elasticnet_alpha_mean_marker)
plt.xlabel('Alpha Hyperparameter')
plt.ylabel('Mean Squared Error')
plt.title('Elastic Net Regression Hyperparameter Tuning Using LOOCV')
plt.show()

##################################
# Generating elastic-net regression
# predictions for the train set
##################################
elasticnet_y_hat_train = elasticnet_regression_pipeline.predict(X_train)

##################################
# Computing the evaluation metrics
# and tagging each metric row with
# the model name and the data split
##################################
elasticnet_performance_train = model_performance_evaluation(y_train, elasticnet_y_hat_train)
for label_column, label_value in (('model', 'elasticnet_regression'), ('set', 'train')):
    # One label per metric row (three metric rows are expected)
    elasticnet_performance_train[label_column] = [label_value] * 3
print('Elastic Net Regression Model Performance on Train Data: ')
display(elasticnet_performance_train)

Elastic Net Regression Model Performance on Train Data:

##################################
# Generating elastic-net regression
# predictions for the test set
##################################
elasticnet_y_hat_test = elasticnet_regression_pipeline.predict(X_test)

##################################
# Computing the evaluation metrics
# and tagging each metric row with
# the model name and the data split
##################################
elasticnet_performance_test = model_performance_evaluation(y_test, elasticnet_y_hat_test)
for label_column, label_value in (('model', 'elasticnet_regression'), ('set', 'test')):
    # One label per metric row (three metric rows are expected)
    elasticnet_performance_test[label_column] = [label_value] * 3
print('Elastic Net Regression Model Performance on Test Data: ')
display(elasticnet_performance_test)

Elastic Net Regression Model Performance on Test Data:

##################################
# Drawing a scatter of the actual
# against the predicted target values
# for the elastic-net regression model
##################################
figure, axes = plt.subplots(figsize=(10, 6))
axes.grid(True)
# Markers only (no connecting line) so each test case is one point
axes.plot(y_test,
          elasticnet_y_hat_test,
          linestyle='',
          marker='o',
          markersize=3.0)
# Same limits on both axes keep the two scales directly comparable
lim = (-2, 2)
axes.set(xlabel='Actual Cancer Rate',
         ylabel='Predicted Cancer Rate',
         xlim=lim,
         ylim=lim,
         title='Elastic Net Regression Model Prediction Performance');

##################################
# Stacking the train and test
# performance tables of every model
# into a single dataframe
##################################
# Each model contributes its train table followed by its test table
performance_comparison_frames = [
    linear_performance_train, linear_performance_test,
    polynomial_performance_train, polynomial_performance_test,
    ridge_performance_train, ridge_performance_test,
    lasso_performance_train, lasso_performance_test,
    elasticnet_performance_train, elasticnet_performance_test,
]
performance_comparison = pd.concat(performance_comparison_frames, ignore_index=True)
print('Consolidated Model Performance on Train and Test Data: ')
display(performance_comparison)

Consolidated Model Performance on Train and Test Data:

##################################
# Consolidating all the R2
# model performance measures
##################################
performance_comparison_R2 = performance_comparison[performance_comparison['metric_name']=='R2']
performance_comparison_R2_train = performance_comparison_R2[performance_comparison_R2['set']=='train'].loc[:,"metric_value"]
performance_comparison_R2_test = performance_comparison_R2[performance_comparison_R2['set']=='test'].loc[:,"metric_value"]

##################################
# Plotting all the R2
# model performance measures
# between train and test sets
##################################
# Train values, test values and model names are paired by position, so the
# train rows, the test rows and the unique model names must share one order
performance_comparison_R2_plot = pd.DataFrame({'train': performance_comparison_R2_train.values,
                                              'test': performance_comparison_R2_test.values},
                                              index=performance_comparison_R2['model'].unique())
# From here on the name refers to the Axes returned by barh, not the dataframe
performance_comparison_R2_plot = performance_comparison_R2_plot.plot.barh(figsize=(10, 6))
performance_comparison_R2_plot.set_xlim(0.00,1.00)
# The chart shows train and test bars side by side, so the title names both sets
performance_comparison_R2_plot.set_title("Model Comparison by R-Squared Performance on Train and Test Data")
performance_comparison_R2_plot.set_xlabel("R-Squared Performance")
performance_comparison_R2_plot.set_ylabel("Regression Model")
performance_comparison_R2_plot.grid(False)
performance_comparison_R2_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Negative padding draws each value label inside its bar
for container in performance_comparison_R2_plot.containers:
    performance_comparison_R2_plot.bar_label(container, fmt='%.5f', padding=-50)

##################################
# Consolidating all the MSE
# model performance measures
##################################
performance_comparison_MSE = performance_comparison[performance_comparison['metric_name']=='MSE']
performance_comparison_MSE_train = performance_comparison_MSE[performance_comparison_MSE['set']=='train'].loc[:,"metric_value"]
performance_comparison_MSE_test = performance_comparison_MSE[performance_comparison_MSE['set']=='test'].loc[:,"metric_value"]

##################################
# Plotting all the MSE
# model performance measures
# between train and test sets
##################################
# Train values, test values and model names are paired by position, so the
# train rows, the test rows and the unique model names must share one order
performance_comparison_MSE_plot = pd.DataFrame({'train': performance_comparison_MSE_train.values,
                                                'test': performance_comparison_MSE_test.values},
                                               index=performance_comparison_MSE['model'].unique())
# From here on the name refers to the Axes returned by barh, not the dataframe
performance_comparison_MSE_plot = performance_comparison_MSE_plot.plot.barh(figsize=(10, 6))
performance_comparison_MSE_plot.set_xlim(0.00,1.00)
# The chart shows train and test bars side by side, so the title names both sets
performance_comparison_MSE_plot.set_title("Model Comparison by Mean Squared Error Performance on Train and Test Data")
performance_comparison_MSE_plot.set_xlabel("Mean Squared Error Performance")
performance_comparison_MSE_plot.set_ylabel("Regression Model")
performance_comparison_MSE_plot.grid(False)
performance_comparison_MSE_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Negative padding draws each value label inside its bar
for container in performance_comparison_MSE_plot.containers:
    performance_comparison_MSE_plot.bar_label(container, fmt='%.5f', padding=-50)

##################################
# Consolidating all the MAE
# model performance measures
##################################
performance_comparison_MAE = performance_comparison[performance_comparison['metric_name']=='MAE']
performance_comparison_MAE_train = performance_comparison_MAE[performance_comparison_MAE['set']=='train'].loc[:,"metric_value"]
performance_comparison_MAE_test = performance_comparison_MAE[performance_comparison_MAE['set']=='test'].loc[:,"metric_value"]

##################################
# Plotting all the MAE
# model performance measures
# between train and test sets
##################################
# Train values, test values and model names are paired by position, so the
# train rows, the test rows and the unique model names must share one order
performance_comparison_MAE_plot = pd.DataFrame({'train': performance_comparison_MAE_train.values,
                                                'test': performance_comparison_MAE_test.values},
                                               index=performance_comparison_MAE['model'].unique())
# From here on the name refers to the Axes returned by barh, not the dataframe
performance_comparison_MAE_plot = performance_comparison_MAE_plot.plot.barh(figsize=(10, 6))
performance_comparison_MAE_plot.set_xlim(0.00,1.00)
# The chart shows train and test bars side by side, so the title names both sets
performance_comparison_MAE_plot.set_title("Model Comparison by Mean Absolute Error Performance on Train and Test Data")
performance_comparison_MAE_plot.set_xlabel("Mean Absolute Error Performance")
performance_comparison_MAE_plot.set_ylabel("Regression Model")
performance_comparison_MAE_plot.grid(False)
performance_comparison_MAE_plot.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Negative padding draws each value label inside its bar
for container in performance_comparison_MAE_plot.containers:
    performance_comparison_MAE_plot.bar_label(container, fmt='%.5f', padding=-50)

##################################
# Injecting a CSS rule that sets
# the font of rendered HTML cells
##################################
from IPython.display import display, HTML
notebook_font_style = "<style>.rendered_html { font-size: 15px; font-family: 'Trebuchet MS'; }</style>"
display(HTML(notebook_font_style))

	COUNTRY	CANRAT	GDPPER	URBPOP	PATRES	RNDGDP	POPGRO	LIFEXP	TUBINC	DTHCMD	...	RELOUT	METEMI	FORARE	CO2EMI	PM2EXP	POPDEN	ENRTER	GDPCAP	HDICAT	EPISCO
0	Australia	452.4000	98380.6360	86.2410	2368.0000	NaN	1.2357	83.2000	7.2000	4.9411	...	13.6378	131484.7632	17.4213	14.7727	24.8936	3.3353	110.1392	51722.0690	VH	60.1000
1	New Zealand	422.9000	77541.7644	86.6990	348.0000	NaN	2.2048	82.2561	7.2000	4.3547	...	80.0814	32241.9370	37.5701	6.1608	NaN	19.3316	75.7348	41760.5948	VH	56.7000
2	Ireland	372.8000	198405.8750	63.6530	75.0000	1.2324	1.0291	82.5561	5.3000	5.6846	...	27.9654	15252.8246	11.3517	6.7682	0.2741	72.3673	74.6803	85420.1909	VH	57.4000
3	United States	362.2000	130941.6369	82.6640	269586.0000	3.4229	0.9643	76.9805	2.3000	5.3021	...	13.2286	748241.4029	33.8669	13.0328	3.3432	36.2410	87.5677	63528.6343	VH	51.1000
4	Denmark	351.1000	113300.6011	88.1160	1261.0000	2.9687	0.2916	81.6024	4.1000	6.8261	...	65.5059	7778.7739	15.7110	4.6912	56.9145	145.7851	82.6643	60915.4244	VH	77.9000

	count	mean	std	min	25%	50%	75%	max
CANRAT	177.0000	183.8294	79.7434	78.4000	118.1000	155.3000	240.4000	452.4000
GDPPER	165.0000	45284.4243	39417.9404	1718.8049	13545.2545	34024.9009	66778.4160	234646.9049
URBPOP	174.0000	59.7881	22.8064	13.3450	42.4327	61.7015	79.1865	100.0000
PATRES	108.0000	20607.3889	134068.3101	1.0000	35.2500	244.5000	1297.7500	1344817.0000
RNDGDP	74.0000	1.1975	1.1900	0.0398	0.2564	0.8737	1.6088	5.3545
POPGRO	174.0000	1.1270	1.1977	-2.0793	0.2369	1.1800	2.0312	3.7271
LIFEXP	174.0000	71.7461	7.6062	52.7770	65.9075	72.4646	77.5235	84.5600
TUBINC	174.0000	105.0059	136.7229	0.7700	12.0000	44.5000	147.7500	592.0000
DTHCMD	170.0000	21.2605	19.2733	1.2836	6.0780	12.4563	36.9805	65.2079
AGRLND	174.0000	38.7935	21.7155	0.5128	20.1303	40.3866	54.0138	80.8411
GHGEMI	170.0000	259582.7099	1118549.7301	179.7252	12527.4874	41009.2760	116482.5786	12942868.3400
RELOUT	153.0000	39.7600	31.9149	0.0003	10.5827	32.3817	63.0115	100.0000
METEMI	170.0000	47876.1336	134661.0653	11.5961	3662.8849	11118.9760	32368.9090	1186285.1810
FORARE	173.0000	32.2182	23.1200	0.0081	11.6044	31.5090	49.0718	97.4121
CO2EMI	170.0000	3.7511	4.6065	0.0326	0.6319	2.2984	4.8235	31.7268
PM2EXP	167.0000	91.9406	22.0600	0.2741	99.6271	100.0000	100.0000	100.0000
POPDEN	174.0000	200.8868	645.3834	2.1151	27.4545	77.9831	153.9936	7918.9513
ENRTER	116.0000	49.9950	29.7062	2.4326	22.1072	53.3925	71.0575	143.3107
GDPCAP	170.0000	13992.0956	19579.5430	216.8274	1870.5030	5348.1929	17421.1162	117370.4969
EPISCO	165.0000	42.9467	12.4909	18.9000	33.0000	40.9000	50.5000	77.9000

	Row.Name	Column.Count	Null.Count	Missing.Rate
0	Australia	22	1	0.0455
1	New Zealand	22	2	0.0909
2	Ireland	22	0	0.0000
3	United States	22	0	0.0000
4	Denmark	22	0	0.0000
...	...	...	...	...
172	Congo Republic	22	3	0.1364
173	Bhutan	22	2	0.0909
174	Nepal	22	2	0.0909
175	Gambia	22	4	0.1818
176	Niger	22	2	0.0909

	Row.Name	Column.Count	Null.Count	Missing.Rate
35	Guadeloupe	22	20	0.9091
39	Martinique	22	20	0.9091
56	French Guiana	22	20	0.9091
13	New Caledonia	22	11	0.5000
44	French Polynesia	22	11	0.5000
75	Guam	22	11	0.5000
53	Puerto Rico	22	9	0.4091
85	North Korea	22	6	0.2727
168	South Sudan	22	6	0.2727
132	Somalia	22	6	0.2727
117	Libya	22	5	0.2273
73	Venezuela	22	5	0.2273
161	Eritrea	22	5	0.2273
164	Yemen	22	5	0.2273

	Numeric.Column.Name	Minimum	Mean	Median	Maximum	First.Mode	Second.Mode	First.Mode.Count	Second.Mode.Count	First.Second.Mode.Ratio	Unique.Count	Row.Count	Unique.Count.Ratio	Skewness	Kurtosis
0	CANRAT	78.4000	183.8294	155.3000	452.4000	135.3000	130.6000	3	2	1.5000	167	177	0.9435	0.8818	0.0635
1	GDPPER	1718.8049	45284.4243	34024.9009	234646.9049	98380.6360	77541.7644	1	1	1.0000	165	177	0.9322	1.5176	3.4720
2	URBPOP	13.3450	59.7881	61.7015	100.0000	100.0000	86.6990	2	1	2.0000	173	177	0.9774	-0.2107	-0.9628
3	PATRES	1.0000	20607.3889	244.5000	1344817.0000	6.0000	2.0000	4	3	1.3333	97	177	0.5480	9.2844	91.1872
4	RNDGDP	0.0398	1.1975	0.8737	5.3545	1.2324	3.4229	1	1	1.0000	74	177	0.4181	1.3967	1.6960
5	POPGRO	-2.0793	1.1270	1.1800	3.7271	1.2357	2.2048	1	1	1.0000	174	177	0.9831	-0.1952	-0.4236
6	LIFEXP	52.7770	71.7461	72.4646	84.5600	83.2000	82.2561	1	1	1.0000	174	177	0.9831	-0.3580	-0.6496
7	TUBINC	0.7700	105.0059	44.5000	592.0000	12.0000	4.1000	4	3	1.3333	131	177	0.7401	1.7463	2.4294
8	DTHCMD	1.2836	21.2605	12.4563	65.2079	4.9411	4.3547	1	1	1.0000	170	177	0.9605	0.9005	-0.6915
9	AGRLND	0.5128	38.7935	40.3866	80.8411	46.2525	38.5629	1	1	1.0000	174	177	0.9831	0.0740	-0.9262
10	GHGEMI	179.7252	259582.7099	41009.2760	12942868.3400	571903.1199	80158.0258	1	1	1.0000	170	177	0.9605	9.4961	101.6373
11	RELOUT	0.0003	39.7600	32.3817	100.0000	100.0000	80.0814	3	1	3.0000	151	177	0.8531	0.5011	-0.9818
12	METEMI	11.5961	47876.1336	11118.9760	1186285.1810	131484.7632	32241.9370	1	1	1.0000	170	177	0.9605	5.8010	38.6614
13	FORARE	0.0081	32.2182	31.5090	97.4121	17.4213	37.5701	1	1	1.0000	173	177	0.9774	0.5193	-0.3226
14	CO2EMI	0.0326	3.7511	2.2984	31.7268	14.7727	6.1608	1	1	1.0000	170	177	0.9605	2.7216	10.3116
15	PM2EXP	0.2741	91.9406	100.0000	100.0000	100.0000	100.0000	106	2	53.0000	61	177	0.3446	-3.1416	9.0324
16	POPDEN	2.1151	200.8868	77.9831	7918.9513	3.3353	19.3316	1	1	1.0000	174	177	0.9831	10.2678	119.9953
17	ENRTER	2.4326	49.9950	53.3925	143.3107	110.1392	75.7348	1	1	1.0000	116	177	0.6554	0.2759	-0.3929
18	GDPCAP	216.8274	13992.0956	5348.1929	117370.4969	51722.0690	41760.5948	1	1	1.0000	170	177	0.9605	2.2586	5.9387
19	EPISCO	18.9000	42.9467	40.9000	77.9000	29.6000	43.6000	3	3	1.0000	137	177	0.7740	0.6418	0.0352

Supervised Learning : Exploring Penalized Models for Predicting Numeric Responses¶

John Pauline Pineda

November 16, 2023

1. Table of Contents ¶

1.1. Data Background ¶

1.2. Data Description ¶

1.3. Data Quality Assessment ¶

1.4. Data Preprocessing ¶

1.4.1 Data Cleaning ¶

1.4.2 Missing Data Imputation ¶

1.4.3 Outlier Detection ¶

1.4.4 Collinearity ¶

1.4.5 Shape Transformation ¶

1.4.6 Centering and Scaling ¶

1.4.7 Data Encoding ¶

1.4.8 Preprocessed Data Description ¶

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

1.5.2 Hypothesis Testing ¶

1.6. Model Development ¶

1.6.1 Premodelling Data Description ¶

1.6.2 Linear Regression ¶

1.6.3 Polynomial Regression ¶

1.6.4 Ridge Regression ¶

1.6.5 Least Absolute Shrinkage and Selection Operator Regression ¶

1.6.6 Elastic Net Regression ¶

1.7. Consolidated Findings ¶

2. Summary ¶

3. References ¶

	Column.Name	Column.Type	Row.Count	Non.Null.Count	Null.Count	Fill.Rate
0	COUNTRY	object	177	177	0	1.0000
1	CANRAT	float64	177	177	0	1.0000
2	GDPPER	float64	177	165	12	0.9322
3	URBPOP	float64	177	174	3	0.9831
4	PATRES	float64	177	108	69	0.6102
5	RNDGDP	float64	177	74	103	0.4181
6	POPGRO	float64	177	174	3	0.9831
7	LIFEXP	float64	177	174	3	0.9831
8	TUBINC	float64	177	174	3	0.9831
9	DTHCMD	float64	177	170	7	0.9605
10	AGRLND	float64	177	174	3	0.9831
11	GHGEMI	float64	177	170	7	0.9605
12	RELOUT	float64	177	153	24	0.8644
13	METEMI	float64	177	170	7	0.9605
14	FORARE	float64	177	173	4	0.9774
15	CO2EMI	float64	177	170	7	0.9605
16	PM2EXP	float64	177	167	10	0.9435
17	POPDEN	float64	177	174	3	0.9831
18	ENRTER	float64	177	116	61	0.6554
19	GDPCAP	float64	177	170	7	0.9605
20	HDICAT	category	177	167	10	0.9435
21	EPISCO	float64	177	165	12	0.9322

	Numeric.Column.Name	Skewness	Outlier.Count	Row.Count	Outlier.Ratio
0	CANRAT	0.9101	2	163	0.0123
1	GDPPER	1.5544	3	163	0.0184
2	URBPOP	-0.2123	0	163	0.0000
3	POPGRO	-0.1817	0	163	0.0000
4	LIFEXP	-0.3297	0	163	0.0000
5	TUBINC	1.7480	12	163	0.0736
6	DTHCMD	0.9307	0	163	0.0000
7	AGRLND	0.0353	0	163	0.0000
8	GHGEMI	9.3000	27	163	0.1656
9	METEMI	5.6887	20	163	0.1227
10	FORARE	0.5562	0	163	0.0000
11	CO2EMI	2.6936	11	163	0.0675
12	PM2EXP	-3.0616	37	163	0.2270
13	POPDEN	9.9728	20	163	0.1227
14	GDPCAP	2.3111	22	163	0.1350
15	EPISCO	0.6360	3	163	0.0184

	Pearson.Correlation.Coefficient	Correlation.PValue
GDPPER_GDPCAP	0.9210	0.0000
GHGEMI_METEMI	0.9051	0.0000
POPGRO_DTHCMD	0.7595	0.0000
GDPPER_LIFEXP	0.7558	0.0000
CANRAT_EPISCO	0.7126	0.0000
CANRAT_GDPCAP	0.6970	0.0000
GDPCAP_EPISCO	0.6967	0.0000
CANRAT_LIFEXP	0.6923	0.0000
CANRAT_GDPPER	0.6868	0.0000
LIFEXP_GDPCAP	0.6838	0.0000
GDPPER_EPISCO	0.6808	0.0000
GDPPER_URBPOP	0.6664	0.0000
GDPPER_CO2EMI	0.6550	0.0000
TUBINC_DTHCMD	0.6436	0.0000
URBPOP_LIFEXP	0.6240	0.0000
LIFEXP_EPISCO	0.6203	0.0000
URBPOP_GDPCAP	0.5592	0.0000
CO2EMI_GDPCAP	0.5502	0.0000
URBPOP_CO2EMI	0.5500	0.0000
LIFEXP_CO2EMI	0.5313	0.0000

	Pearson.Correlation.Coefficient	Correlation.PValue
CANRAT_CANRAT	1.0000	0.0000
CANRAT_GDPCAP	0.7351	0.0000
CANRAT_LIFEXP	0.7024	0.0000
CANRAT_DTHCMD	-0.6871	0.0000
CANRAT_EPISCO	0.6484	0.0000
CANRAT_TUBINC	-0.6289	0.0000
CANRAT_CO2EMI	0.5855	0.0000
CANRAT_POPGRO	-0.4985	0.0000
CANRAT_URBPOP	0.4794	0.0000
CANRAT_GHGEMI	0.2325	0.0028
CANRAT_FORARE	0.1653	0.0350
CANRAT_AGRLND	-0.0245	0.7560
CANRAT_POPDEN	0.0019	0.9808

	T.Test.Statistic	T.Test.PValue
CANRAT_HDICAT_VH	-10.6057	0.0000
CANRAT_HDICAT_L	6.5598	0.0000
CANRAT_HDICAT_M	5.1050	0.0000
CANRAT_HDICAT_H	-0.6360	0.5257

	CANRAT	LIFEXP	TUBINC	DTHCMD	CO2EMI	GDPCAP	EPISCO	HDICAT_VH
0	2.0765	1.6432	-1.1023	-0.9715	1.7368	1.5498	1.3067	True
1	1.9630	1.4880	-1.1023	-1.0914	0.9435	1.4078	1.1029	True
2	1.7428	1.5370	-1.2753	-0.8363	1.0317	1.8794	1.1458	True
3	1.6909	0.6642	-1.6963	-0.9037	1.6277	1.6854	0.7398	True
4	1.6342	1.3819	-1.4134	-0.6571	0.6863	1.6578	2.2183	True

	metric_name	metric_value	model	set
0	R2	0.6332	linear_regression	train
1	MSE	0.3550	linear_regression	train
2	MAE	0.4609	linear_regression	train

	metric_name	metric_value	model	set
0	R2	0.6446	linear_regression	test
1	MSE	0.3716	linear_regression	test
2	MAE	0.4773	linear_regression	test

	metric_name	metric_value	model	set
0	R2	0.7908	polynomial_regression	train
1	MSE	0.2024	polynomial_regression	train
2	MAE	0.3503	polynomial_regression	train

	metric_name	metric_value	model	set
0	R2	0.6324	polynomial_regression	test
1	MSE	0.3844	polynomial_regression	test
2	MAE	0.4867	polynomial_regression	test

	case_index	alpha	MSE
0	0	0.0001	0.0012
1	1	0.0001	0.0058
2	2	0.0001	0.0872
3	3	0.0001	0.3473
4	4	0.0001	0.1755
...	...	...	...
109	109	1000	0.1383
110	110	1000	0.5213
111	111	1000	1.7320
112	112	1000	0.0289
113	113	1000	0.0201

	case_index	0.0001	0.001	0.01	0.1	1	10	100	1000
0	0	0.0011	0.0007	0.0009	0.0000	0.0020	0.0020	0.0020	0.0020
1	1	0.0058	0.0057	0.0053	0.0074	1.5211	1.5211	1.5211	1.5211
2	2	0.0872	0.0872	0.0894	0.2171	1.5328	1.5328	1.5328	1.5328
3	3	0.3470	0.3452	0.3919	0.3913	0.1931	0.1931	0.1931	0.1931
4	4	0.1760	0.1800	0.2250	0.5213	2.3015	2.3015	2.3015	2.3015
...	...	...	...	...	...	...	...	...	...
109	109	0.1380	0.1374	0.1589	0.2133	0.1710	0.1710	0.1710	0.1710
110	110	0.0088	0.0075	0.0007	0.0089	1.1168	1.1168	1.1168	1.1168
111	111	2.1149	2.1371	2.3557	2.5285	1.1061	1.1061	1.1061	1.1061
112	112	0.0013	0.0020	0.0164	0.0582	0.0015	0.0015	0.0015	0.0015
113	113	0.2060	0.2151	0.2999	0.2414	0.2579	0.2579	0.2579	0.2579

	case_index	alpha	MSE
0	0	0.0001	0.0011
1	1	0.0001	0.0058
2	2	0.0001	0.0872
3	3	0.0001	0.3470
4	4	0.0001	0.1760
...	...	...	...
109	109	1000	0.1710
110	110	1000	1.1168
111	111	1000	1.1061
112	112	1000	0.0015
113	113	1000	0.2579

	l1_ratio	0	1	2	3	4	5	6	7	8	...	104	105	106	107	108	109	110	111	112	113
0	1	0.0020	1.5211	1.5328	0.1931	2.3015	0.6067	0.2884	1.2766	0.4167	...	0.0133	0.9307	2.7951	2.2956	0.6066	0.1710	1.1168	1.1061	0.0015	0.2579
1	1	0.0020	1.5211	1.5328	0.1931	2.3015	0.6067	0.2884	1.2766	0.4167	...	0.0133	0.9307	2.7951	2.2956	0.6066	0.1710	1.1168	1.1061	0.0015	0.2579
2	1	0.0020	1.5211	1.5328	0.1931	2.3015	0.6067	0.2884	1.2766	0.4167	...	0.0133	0.9307	2.7951	2.2956	0.6066	0.1710	1.1168	1.1061	0.0015	0.2579
3	1	0.0022	0.1173	0.3284	0.2850	0.8757	0.8514	0.1458	0.2705	0.0295	...	0.0014	1.8480	1.5636	0.6217	0.3866	0.1205	0.1191	2.3643	0.0781	0.0792
4	1	0.0049	0.0010	0.0994	0.3206	0.3641	0.8689	0.5433	0.0626	0.0132	...	0.0081	2.0241	1.5584	0.1701	0.3002	0.1235	0.0001	2.6688	0.0987	0.3930
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
67	9	0.0020	1.5211	1.5328	0.1931	2.3015	0.6067	0.2884	1.2766	0.4167	...	0.0133	0.9307	2.7951	2.2956	0.6066	0.1710	1.1168	1.1061	0.0015	0.2579
68	9	0.0001	0.0051	0.2029	0.3840	0.5113	0.8459	0.4257	0.1601	0.0067	...	0.0027	1.9887	1.6780	0.2703	0.2514	0.2032	0.0066	2.5404	0.0650	0.2614
69	9	0.0007	0.0052	0.0875	0.3870	0.2200	0.8004	0.7328	0.0480	0.0338	...	0.0085	1.7579	1.8444	0.1508	0.3006	0.1555	0.0007	2.3405	0.0158	0.2965
70	9	0.0007	0.0057	0.0872	0.3448	0.1798	0.7830	0.7569	0.0328	0.0315	...	0.0233	1.5690	2.0451	0.1154	0.2950	0.1373	0.0076	2.1355	0.0020	0.2146
71	9	0.0011	0.0058	0.0872	0.3470	0.1760	0.7811	0.7633	0.0314	0.0313	...	0.0253	1.5500	2.0788	0.1121	0.2942	0.1381	0.0088	2.1147	0.0013	0.2060

	metric_name	metric_value	model	set
0	R2	0.6220	ridge_regression	train
1	MSE	0.3659	ridge_regression	train
2	MAE	0.4680	ridge_regression	train

	metric_name	metric_value	model	set
0	R2	0.6352	ridge_regression	test
1	MSE	0.3815	ridge_regression	test
2	MAE	0.4838	ridge_regression	test

	metric_name	metric_value	model	set
0	R2	0.6295	lasso_regression	train
1	MSE	0.3586	lasso_regression	train
2	MAE	0.4608	lasso_regression	train

	metric_name	metric_value	model	set
0	R2	0.6405	lasso_regression	test
1	MSE	0.3760	lasso_regression	test
2	MAE	0.4805	lasso_regression	test

	metric_name	metric_value	model	set
0	R2	0.6298	elasticnet_regression	train
1	MSE	0.3582	elasticnet_regression	train
2	MAE	0.4608	elasticnet_regression	train

	metric_name	metric_value	model	set
0	R2	0.6409	elasticnet_regression	test
1	MSE	0.3755	elasticnet_regression	test
2	MAE	0.4800	elasticnet_regression	test

Supervised Learning : Exploring Penalized Models for Predicting Numeric Responses¶

John Pauline Pineda November 16, 2023

1. Table of Contents ¶

1.1. Data Background ¶

1.2. Data Description ¶

1.3. Data Quality Assessment ¶

1.4. Data Preprocessing ¶

1.4.1 Data Cleaning ¶

1.4.2 Missing Data Imputation ¶

1.4.3 Outlier Detection ¶

1.4.4 Collinearity ¶

1.4.5 Shape Transformation ¶

1.4.6 Centering and Scaling ¶

1.4.7 Data Encoding ¶

1.4.8 Preprocessed Data Description ¶

1.5. Data Exploration ¶

1.5.1 Exploratory Data Analysis ¶

1.5.2 Hypothesis Testing ¶

1.6. Model Development ¶

1.6.1 Premodelling Data Description ¶

1.6.2 Linear Regression ¶

1.6.3 Polynomial Regression ¶

1.6.4 Ridge Regression ¶

1.6.5 Least Absolute Shrinkage and Selection Operator Regression ¶

1.6.6 Elastic Net Regression ¶

1.7. Consolidated Findings ¶

2. Summary ¶

3. References ¶

John Pauline Pineda

November 16, 2023