# %load_ext autoreload
# %autoreload 2

# Import libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.tree import export_graphviz
from IPython.display import Image
from subprocess import call
import warnings
warnings.filterwarnings("ignore")
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from visualize_py import plot_confusion_matrix, plot_roc_curve, plot_feature_importance, \
    plot_feature_importance_log, plot_feature_importance_dec, plotVar, plotAge, plotContinuous, plotCategorical
from IPython.core.display import HTML as Center

Center(""" <style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style> """)

# Import data
df = pd.read_csv('../input/uci-heart-disease-data-set/processed.cleveland.data', header = None, 
                 names = ['age', 'sex', 'chest_pain_type', 'rest_blood_press', 'cholesterol',
                          'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate', 
                          'exer_ind_angina', 'st_depression', 'st_slope',                                                                                                         'num_major_vessels', 'thallium_scint', 'ca_disease'], 
                 index_col=None, usecols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], 
                 sep=',', skipinitialspace=True)
df.head(20)
print(df.shape)

(303, 14)

# Type of data
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  303 non-null    float64
 1   sex                  303 non-null    float64
 2   chest_pain_type      303 non-null    float64
 3   rest_blood_press     303 non-null    float64
 4   cholesterol          303 non-null    float64
 5   fasting_blood_sugar  303 non-null    float64
 6   rest_ecg             303 non-null    float64
 7   max_heart_rate       303 non-null    float64
 8   exer_ind_angina      303 non-null    float64
 9   st_depression        303 non-null    float64
 10  st_slope             303 non-null    float64
 11  num_major_vessels    303 non-null    object 
 12  thallium_scint       303 non-null    object 
 13  ca_disease           303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB
None

# Find out what types of data is stored in object column. Use simple regex. Leading number omitted not catered for.
# Future improvement.
r = re.compile(r'\d+[.,]\d*')
df.num_major_vessels.apply(lambda x: bool(r.match(x))).value_counts()
df.thallium_scint.apply(lambda x: bool(r.match(x))).value_counts()

df.loc[df.num_major_vessels.apply(lambda x: bool(r.match(x))) == False]
df.loc[df.thallium_scint.apply(lambda x: bool(r.match(x))) == False]

True     299
False      4
Name: num_major_vessels, dtype: int64

True     301
False      2
Name: thallium_scint, dtype: int64

# Replace all non numeric data with NaN, convert strings to numbers.
df_clean = df.apply(pd.to_numeric, errors='coerce')

# Number of null values
print(df_clean.isnull().sum())

# Actual null values
df_clean[df_clean.isnull().any(axis=1)]

age                    0
sex                    0
chest_pain_type        0
rest_blood_press       0
cholesterol            0
fasting_blood_sugar    0
rest_ecg               0
max_heart_rate         0
exer_ind_angina        0
st_depression          0
st_slope               0
num_major_vessels      4
thallium_scint         2
ca_disease             0
dtype: int64

df_fin = df_clean.copy()
median = df_fin['num_major_vessels'].median()
df_fin['num_major_vessels'].fillna(median, inplace=True)
print("Number of null values in num_major_vessels column: {}".format(df_fin['num_major_vessels'].isnull().sum()))

median = df_fin['thallium_scint'].median()
df_fin['thallium_scint'].fillna(median, inplace=True)
print("Number of null values in thallium_scint column: {}".format(df_fin['thallium_scint'].isnull().sum()))
print("Dataframe dimension: {}".format(df_fin.shape))

Number of null values in num_major_vessels column: 0
Number of null values in thallium_scint column: 0
Dataframe dimension: (303, 14)

print(df_fin.nunique())

age                     41
sex                      2
chest_pain_type          4
rest_blood_press        50
cholesterol            152
fasting_blood_sugar      2
rest_ecg                 3
max_heart_rate          91
exer_ind_angina          2
st_depression           40
st_slope                 3
num_major_vessels        4
thallium_scint           3
ca_disease               5
dtype: int64

# Extract categorical variables for analysis.
df_fin_cat = df_fin.copy()
names_cat = ('sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg', 'exer_ind_angina', 'st_slope',
             'num_major_vessels', 'thallium_scint', 'ca_disease')
df_fin_cat = df_fin_cat.loc[:, names_cat]

# Extract continuous variables for analysis.
#df_fin.reset_index(drop=True, inplace=True)
df_fin_con = df_fin.copy()
names_con = ('age', 'rest_blood_press', 'cholesterol', 'max_heart_rate', 'st_depression')
df_fin_con = df_fin_con.loc[:, names_con]

# Plotting label dictionary
plot_cat = [('sex', ['female', 'male']),
              ('chest_pain_type', ['typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic']),
              ('fasting_blood_sugar', ['fbs > 120mg', 'fbs < 120mg']),
              ('rest_ecg', ['normal', 'ST T-wave', 'left ventricular']),
              ('exer_ind_angina', ['yes', 'no']),
              ('st_slope', ['upsloping', 'flat', 'downsloping']),
              ('num_major_vessels', ['0', '1', '2', '3']),
              ('thallium_scint', ['normal', 'fixed defect', 'reversible defect']),
              ('ca_disease', ['No CA disease', 'CA disease'])]
plot_con = [('age', 'age in years'),
            ('rest_blood_press', 'blood pressure in mm Hg'),
              ('cholesterol', 'serum cholesterol in mg/d'),
              ('max_heart_rate', 'maximum heart rate achieved'),
              ('st_depression', 'ST depression by exercise relative to rest'),
              ('ca_disease', 'Coronary Artery disease')]

# Extract numeric variables for analysis.
df_fin.reset_index(drop=True, inplace=True)
df_fin_num = df_fin.copy()
names_num = ('age', 'sex', 'chest_pain_type', 'rest_blood_press', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
             'max_heart_rate', 'exer_ind_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thallium_scint',
             'ca_disease')
df_fin_num = df_fin_num.loc[:, names_num]

#outcome_counts = df_fin_cat.groupby('ca_disease').size()
outcome_counts = df_fin_cat['ca_disease'].value_counts()
print(outcome_counts)

# Plot outcome counts.
ax = sns.barplot(x=outcome_counts.index, y=outcome_counts.values, alpha=0.9)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
#ax.title.set_size(45)
ax.tick_params('y', labelsize = 20);
ax.tick_params('x', labelsize = 20);
plt.title('Frequency Distribution of Outcomes', fontsize=20)
plt.ylabel('Number of Occurrences')
plt.xlabel('Outcome')
plt.show();

0    164
1     55
2     36
3     35
4     13
Name: ca_disease, dtype: int64

# Transform y variable
df_fin_cat['num_bin'] = df_fin['ca_disease'].apply(lambda x: 1 if x > 0 else 0)

outcome_counts_bin = df_fin_cat['num_bin'].value_counts()
print(outcome_counts_bin)
ax = sns.barplot(x=outcome_counts_bin.index, y=outcome_counts_bin.values)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
#ax.title.set_size(45)
ax.tick_params('y', labelsize = 20);
ax.tick_params('x', labelsize = 20);
plt.title('Frequency Distribution of Outcomes', fontsize = 20)
plt.ylabel('Number of Occurrences')
plt.xlabel('Outcome')
plt.show();

0    164
1    139
Name: num_bin, dtype: int64

# Class level counts for categorical variables.
for variable in names_cat:
    print(df_fin_cat[variable].value_counts())

1.0    206
0.0     97
Name: sex, dtype: int64
4.0    144
3.0     86
2.0     50
1.0     23
Name: chest_pain_type, dtype: int64
0.0    258
1.0     45
Name: fasting_blood_sugar, dtype: int64
0.0    151
2.0    148
1.0      4
Name: rest_ecg, dtype: int64
0.0    204
1.0     99
Name: exer_ind_angina, dtype: int64
1.0    142
2.0    140
3.0     21
Name: st_slope, dtype: int64
0.0    180
1.0     65
2.0     38
3.0     20
Name: num_major_vessels, dtype: int64
3.0    168
7.0    117
6.0     18
Name: thallium_scint, dtype: int64
0    164
1     55
2     36
3     35
4     13
Name: ca_disease, dtype: int64

# Bar chart plot of categorical variables.
fig, ax = plt.subplots(3, 3, figsize=(25, 20));
for variable, subplot in zip(names_cat, ax.flatten()):
    subplot.xaxis.label.set_size(34)
    subplot.yaxis.label.set_size(34)
    subplot.tick_params('y', labelsize = 30);
    subplot.tick_params('x', labelsize = 30);
    cp = sns.countplot(x=df_fin_cat[variable], ax=subplot);
plt.tight_layout()

# Continuous density plot
df_fin_num['ca_disease'] = df_fin_num['ca_disease'].apply(lambda x: 1 if x > 0 else 0)

fig_age, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), squeeze=False)
_ = plotAge(df=df_fin_num, axes=axes, single_plot=True);

# 5 number summary.
df_fin_con.describe()

# Continuous density plot
#df_fin_num['ca_disease'] = df_fin_num['ca_disease'].apply(lambda x: 1 if x > 0 else 0)

fig_continuous, axes = plt.subplots(nrows=len(plot_con)-1, ncols=2, figsize=(15, 22))
_ = plotVar(isCategorical=False, categorical = plot_cat, continuous = plot_con, df=df_fin_num, axes=axes);

for variable in names_con:
    skew = df_fin_con[variable].skew()
    print("Skewness value for {}: {}".format(variable, skew))

Skewness value for age: -0.20906046882012877
Skewness value for rest_blood_press: 0.7060346498131783
Skewness value for cholesterol: 1.1355031533363453
Skewness value for max_heart_rate: -0.5374486699106463
Skewness value for st_depression: 1.2697199306019966

# Boxplot of continuous variables
medianprops = {'color': 'magenta', 'linewidth': 2}
boxprops = {'color': 'black', 'linestyle': '-', 'linewidth': 4}
whiskerprops = {'color': 'black', 'linestyle': '-', 'linewidth': 4}
capprops = {'color': 'black', 'linestyle': '-', 'linewidth': 4}
flierprops = {'color': 'black', 'marker': 'x', 'markersize': 25}

_ = df_fin_con.plot(kind='box', subplots=True, figsize=(35, 25), layout=(2,3), fontsize = 50, medianprops=medianprops,
                    boxprops=boxprops, whiskerprops=whiskerprops, capprops=capprops, flierprops=flierprops);

_ = plt.tight_layout();
_ = plt.show();

# Transform y variable
df_fin['ca_disease'] = df_fin['ca_disease'].apply(lambda x: 1 if x > 0 else 0)

# Copy original dataset
df_fin_nn = df_fin.copy()

# Group response values to form binary response
y = df_fin_nn.loc[:, 'ca_disease']

# Split data into features (X) and response (y)
X = df_fin_nn.loc[:, ('age', 'sex', 'chest_pain_type', 'rest_blood_press', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate', 'exer_ind_angina', 'st_depression', 'st_slope',
                   'num_major_vessels', 'thallium_scint')]

X.head()

# Put the response y into an array
y = np.ravel(y)

X_train_init, X_test_init, y_train_init, y_test_init = train_test_split(X, y, random_state=0)

print('Percentage holdout data: {}%'.format(round(100*(len(X_test_init)/len(X)),0)))

Percentage holdout data: 25.0%

# Initial model
logreg = LogisticRegression(max_iter=2000000, fit_intercept = False)

# Probability scores for test set
y_score_init = logreg.fit(X_train_init, y_train_init).decision_function(X_test_init)

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test_init, y_score_init)

plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

AUC: 0.8673611111111111

y_pred = logreg.predict(X_test_init)
# Accuracy before model parameter optimisation
cnf_matrix = confusion_matrix(y_pred, y_test_init)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.825  
precision:			0.799 
sensitivity:			0.867

specificity:			0.783 
negative predictive value:	0.854

false positive rate:		0.217  
false negative rate:		0.133 
false discovery rate:		0.201

# Transform thallium_scint variable
trans_thal = {3:0, 7:1, 6:2}
df_fin_nn = df_fin.replace({"thallium_scint": trans_thal})
feature_names = ('age', 'sex', 'chest_pain_type', 'rest_blood_press', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
                 'max_heart_rate', 'exer_ind_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thallium_scint')

# Re-extract transformed X features
X = df_fin_nn.loc[:, feature_names]


# Rebuild training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create variables for Random Forest model
X_train_rf = X_train.copy()
X_test_rf = X_test.copy()
y_train_rf = y_train.copy()
y_test_rf = y_test.copy()

# Scale data
scaler = StandardScaler()

# Fit on training data set
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Apply to test data
X_test = scaler.transform(X_test)

StandardScaler()

# Initial model
logreg = LogisticRegression(fit_intercept = False)

# Probability scores for test set
y_score = logreg.fit(X_train, y_train).decision_function(X_test)

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test, y_score)

plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

AUC: 0.8770833333333333

y_pred = logreg.predict(X_test)
# Accuracy before model parameter optimisation
cnf_matrix = confusion_matrix(y_pred, y_test)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.82  
precision:			0.805 
sensitivity:			0.844

specificity:			0.795 
negative predictive value:	0.836

false positive rate:		0.205  
false negative rate:		0.156 
false discovery rate:		0.195

# Pearson correlation plot.
correlations = df_fin_con.corr()
# Plot correlation matrix
#sns.set(font_scale=1);
fig = plt.figure(figsize=(15, 15))
ax = sns.heatmap(correlations, annot=True, linewidths = 0, vmin=-.5, cmap='pink_r')
ax.xaxis.label.set_size(15)
ax.yaxis.label.set_size(15)
#ax.title.set_size(45)
ax.tick_params('y', labelsize = 15, labelrotation=0);
ax.tick_params('x', labelsize = 15);

# Extract continuous and categorical variables for analysis.
X_con = df_fin_con.copy()
X_cat = df_fin_cat.loc[:,('sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg', 'exer_ind_angina', 'st_slope', 'num_major_vessels', 'thallium_scint')]

X_train_con, X_test_con, y_train_con, y_test_con = train_test_split(X_con, y, random_state=0)
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat, y, random_state=0)

log_reg = sm.Logit(y_train_con, X_train_con)
log_result = log_reg.fit()

print(log_result.summary2())
features_con = np.array(names_con)
plot_feature_importance_log(fit = log_result, features = features_con)

Optimization terminated successfully.
         Current function value: 0.509602
         Iterations 6
                         Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.260     
Dependent Variable: y                AIC:              241.3593  
Date:               2024-01-24 18:40 BIC:              258.4841  
No. Observations:   227              Log-Likelihood:   -115.68   
Df Model:           4                LL-Null:          -156.37   
Df Residuals:       222              LLR p-value:      8.8664e-17
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
age                0.0147   0.0176  0.8334 0.4046 -0.0199  0.0493
rest_blood_press   0.0160   0.0097  1.6516 0.0986 -0.0030  0.0349
cholesterol        0.0050   0.0034  1.4622 0.1437 -0.0017  0.0116
max_heart_rate    -0.0343   0.0066 -5.1736 0.0000 -0.0474 -0.0213
st_depression      0.7812   0.1623  4.8118 0.0000  0.4630  1.0994
=================================================================

Feature:       max_heart_rate	Score:	6.63890
Feature:        st_depression	Score:	5.82521
Feature:     rest_blood_press	Score:	1.00607
Feature:          cholesterol	Score:	0.84262
Feature:                  age	Score:	0.39296

logval = LogisticRegression(fit_intercept = False)

efs1 = EFS(logval,
           min_features=1,
           max_features=1,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X_train, y_train, custom_feature_names=feature_names)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

#efs1 = efs1.fit(X, y, custom_feature_names=feature_names)

df_efs = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df_efs.sort_values('avg_score', inplace=True, ascending=False)

metric_dict = efs1.get_metric_dict()

fig = plt.figure()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

upper, lower = [], []
for k in k_feat:
    upper.append(metric_dict[k]['avg_score'] +
                 metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] -
                 metric_dict[k]['std_dev'])

plt.fill_between(k_feat,
                 upper,
                 lower,
                 alpha=0.2,
                 color='blue',
                 lw=1)

_ = plt.plot(k_feat, avg, color='blue', marker='o');
_ = plt.ylabel('Accuracy +/- Standard Deviation', size = 15)
_ = plt.xlabel('Feature', size = 15)
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
_ = plt.xticks(k_feat,
           [str(metric_dict[k]['feature_names']) for k in k_feat],
           rotation=90, size = 15)
_ = plt.yticks(size = 15)
plt.show();

df_efs

Features: 13/13

Best accuracy score: 0.77
Best subset (indices): (12,)
Best subset (corresponding names): ('thallium_scint',)

<matplotlib.collections.PolyCollection at 0x7bbe86757590>

# Feature extraction set to retain all - we want to see scores for all variables.
test = SelectKBest(score_func=f_classif, k=5)
fit_kbest = test.fit(X_train_con, y_train_con)
features_kbest = np.array(X_con.columns)
plot_feature_importance(fit = fit_kbest, features = features_kbest)

Feature:       max_heart_rate	Score:	11.81243
Feature:        st_depression	Score:	11.80165
Feature:                  age	Score:	3.80501
Feature:     rest_blood_press	Score:	1.77980
Feature:          cholesterol	Score:	1.20672

test_cat = SelectKBest(score_func=chi2, k='all')
fit_kbest_cat = test_cat.fit(X_train_cat, y_train_cat)
features_kbest_cat = np.array(X_cat.columns)
plot_feature_importance(fit = fit_kbest_cat, features = features_kbest_cat)

Feature:    num_major_vessels	Score:	13.64256
Feature:       thallium_scint	Score:	12.12447
Feature:      exer_ind_angina	Score:	9.16471
Feature:      chest_pain_type	Score:	3.09327
Feature:             st_slope	Score:	2.11834
Feature:                  sex	Score:	1.14705
Feature:             rest_ecg	Score:	0.87596
Feature:  fasting_blood_sugar	Score:	0.06877

# Fit a Decision Tree to data - perform cross validation to obtain optimum value for hyperparameter used for pruning.
samples = [sample for sample in range(1,30)]
validation_scores = []
for sample in samples:
    classifier1 = DecisionTreeClassifier(random_state=1, min_samples_leaf=sample)
    score = cross_val_score(estimator=classifier1, X=X_train_rf, y=y_train_rf, cv=5)
    validation_scores.append(score.mean())

# Obtain the minimum leaf samples with the highest validation score
samples_optimum = samples[validation_scores.index(max(validation_scores))]

# Create final classifier
classifier2 = DecisionTreeClassifier(random_state=0, min_samples_leaf=samples_optimum)
classifier2 = classifier2.fit(X_train_rf, y_train_rf)

# Probability scores for test set
y_pred_rf = classifier2.predict(X_test_rf)

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test_rf, y_pred_rf)
plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

AUC: 0.6972222222222222

cnf_matrix = confusion_matrix(y_pred_rf, y_test_rf)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.697  
precision:			0.705 
sensitivity:			0.676

specificity:			0.718 
negative predictive value:	0.689

false positive rate:		0.282  
false negative rate:		0.324 
false discovery rate:		0.295

importances = np.array(classifier2.feature_importances_)
feature_list = np.array(X.columns)
plot_feature_importance_dec(fit = importances, features = feature_list)

Feature:       thallium_scint	Score:	0.52965
Feature:      exer_ind_angina	Score:	0.13927
Feature:        st_depression	Score:	0.12318
Feature:                  age	Score:	0.09639
Feature:      chest_pain_type	Score:	0.06617
Feature:          cholesterol	Score:	0.04124
Feature:    num_major_vessels	Score:	0.00410
Feature:             st_slope	Score:	0.00000
Feature:       max_heart_rate	Score:	0.00000
Feature:             rest_ecg	Score:	0.00000
Feature:  fasting_blood_sugar	Score:	0.00000
Feature:     rest_blood_press	Score:	0.00000
Feature:                  sex	Score:	0.00000

# Show the first few levels of the tree
_ = export_graphviz(classifier2, out_file='tree.dot',
                feature_names = X.columns,
                class_names = ['No CAD', 'CAD'],
                rounded = True, proportion = True,
                label='root',
                precision = 2, filled = True);

_ = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
Image(filename = 'tree.png')

rand_forest = RandomForestClassifier(max_features=0.25, n_estimators=1000, criterion= 'gini',
                                     random_state=0)
rand_forest.fit(X_train_rf, y_train_rf)

# Probability scores for test set
y_pred_rf = rand_forest.predict(X_test_rf)

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test_rf, y_pred_rf)
plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

RandomForestClassifier(max_features=0.25, n_estimators=1000, random_state=0)

AUC: 0.7708333333333333

cnf_matrix = confusion_matrix(y_pred_rf, y_test_rf)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.786  
precision:			0.764 
sensitivity:			0.828

specificity:			0.745 
negative predictive value:	0.812

false positive rate:		0.255  
false negative rate:		0.172 
false discovery rate:		0.236

importances = np.array(rand_forest.feature_importances_)
feature_list = np.array(X.columns)
plot_feature_importance_dec(fit = importances, features = feature_list)

Feature:       max_heart_rate	Score:	0.12651
Feature:       thallium_scint	Score:	0.12493
Feature:        st_depression	Score:	0.11981
Feature:      chest_pain_type	Score:	0.11469
Feature:                  age	Score:	0.09253
Feature:      exer_ind_angina	Score:	0.09140
Feature:          cholesterol	Score:	0.07834
Feature:     rest_blood_press	Score:	0.07699
Feature:    num_major_vessels	Score:	0.07549
Feature:             st_slope	Score:	0.05451
Feature:                  sex	Score:	0.01734
Feature:             rest_ecg	Score:	0.01635
Feature:  fasting_blood_sugar	Score:	0.01110

estimator = rand_forest.estimators_[1]

# Show the first few levels of the tree
_ = export_graphviz(estimator, out_file='tree.dot',
                feature_names = X.columns,
                class_names = ['No CAD', 'CAD'],
                rounded = True, proportion = True,
                label='root',
                precision = 2, filled = True);

_ = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
Image(filename = 'tree.png')

# Extract continuous and categorical variables for analysis.
X_fin_cat = df_fin_cat.loc[:,('sex', 'chest_pain_type', 'exer_ind_angina', 'st_slope', 'num_major_vessels', 'thallium_scint')]
X_fin_con = df_fin_con.loc[:,('max_heart_rate', 'st_depression')]
X_final = pd.concat([X_fin_cat, X_fin_con], axis=1)

X_train_fin, X_test_fin, y_train_fin, y_test_fin = train_test_split(X_final, y, random_state=0)

# Scale data
scaler = StandardScaler()

# Fit on training data set
scaler.fit(X_train_fin)
X_train_fin = scaler.transform(X_train_fin)

# Apply to test data
X_test_fin = scaler.transform(X_test_fin)

# Final model
logfin = LogisticRegression(fit_intercept = False)

# Probability scores for test set
y_score_fin = logfin.fit(X_train_fin, y_train_fin).decision_function(X_test_fin)

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test_fin, y_score_fin)

plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

StandardScaler()

AUC: 0.8694444444444445

y_pred_fin = logfin.predict(X_test_fin)
# Accuracy before model parameter optimisation
cnf_matrix = confusion_matrix(y_pred_fin, y_test_fin)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.87  
precision:			0.829 
sensitivity:			0.931

specificity:			0.809 
negative predictive value:	0.921

false positive rate:		0.191  
false negative rate:		0.069 
false discovery rate:		0.171

# Fit and check MSE before regularisation
reg = MLPClassifier(max_iter=50000, solver="adam", activation="tanh", hidden_layer_sizes=(5, 5), random_state=1)
reg.fit(X_train_fin, y_train_fin)

# Predict
y_pred_fin = reg.predict(X_test_fin)

# Accuracy before model parameter optimisation
accuracy_score(y_pred_fin, y_test_fin)

MLPClassifier(activation='tanh', hidden_layer_sizes=(5, 5), max_iter=50000,
              random_state=1)

0.8289473684210527

# Optimise numbers of nodes on both layers
validation_scores = {}
print("Nodes |Validation")
print("      | score")

for hidden_layer_size in [(i,j) for i in range(3,6) for j in range(3,6)]:

    reg = MLPClassifier(max_iter=1000000, hidden_layer_sizes=hidden_layer_size, random_state=1)

    score = cross_val_score(estimator=reg, X=X_train_fin, y=y_train_fin, cv=2)
    validation_scores[hidden_layer_size] = score.mean()
    print(hidden_layer_size, ": %0.5f" % validation_scores[hidden_layer_size])

Nodes |Validation
      | score
(3, 3) : 0.78850
(3, 4) : 0.82371
(3, 5) : 0.81490
(4, 3) : 0.80182
(4, 4) : 0.79728
(4, 5) : 0.80174
(5, 3) : 0.82386
(5, 4) : 0.78835
(5, 5) : 0.81490

# Check scores
print("The highest validation score is: %0.4f" % max(validation_scores.values()))
optimal_hidden_layer_size = [name for name, score in validation_scores.items()
                              if score==max(validation_scores.values())][0]
print("This corresponds to nodes", optimal_hidden_layer_size )

The highest validation score is: 0.8239
This corresponds to nodes (5, 3)

# Select range over which to find regularisation parameter - exponential used for even distribution of values
reg_par = [np.e**n for n in np.arange(-2,4,0.5)]

validation_scores = {}
print(" alpha  |  Accuracy")
for param in reg_par:
    reg = MLPClassifier(max_iter=1000000, solver="adam", activation="tanh", hidden_layer_sizes=optimal_hidden_layer_size, alpha=param,
                        random_state=1)
    score = cross_val_score(estimator=reg, X=X_train_fin, y=y_train_fin, cv=2, scoring="accuracy")
    validation_scores[param] = score.mean()
    print("%0.5f |  %s" % (param, score.mean()))

# Plot the accuracy function against regularisation parameter
plt.plot([np.log(i) for i in validation_scores.keys()], list(validation_scores.values()));
plt.xlabel("Ln of alpha");
plt.ylabel("Accuracy");

 alpha  |  Accuracy
0.13534 |  0.8324018009625834
0.22313 |  0.832479428660146
0.36788 |  0.8369042074212079
0.60653 |  0.8369042074212079
1.00000 |  0.8369430212699891
1.64872 |  0.850217357553175
2.71828 |  0.8590281012265176
4.48169 |  0.8502561714019562
7.38906 |  0.8590281012265176
12.18249 |  0.8678776587486415
20.08554 |  0.8458313926408942
33.11545 |  0.6079413134606427

max_score = ([np.log(name) for name, score in validation_scores.items() if score==max(validation_scores.values())][0])

# Find lowest value.
print("The highest accuracy score is: %s" % (max(validation_scores.values())))
print("This corresponds to regularisation parameter e**%s" % max_score)

The highest accuracy score is: 0.8678776587486415
This corresponds to regularisation parameter e**2.5

# Fit data with the best parameter
reg = MLPClassifier(max_iter=1000000, solver="adam", activation="tanh", hidden_layer_sizes=optimal_hidden_layer_size,
                    alpha=np.e**(2), random_state=1)

reg.fit(X_train_fin, y_train_fin)

# Predict
y_pred = reg.predict(X_test_fin)

# Accuracy after model parameter optimisation
accuracy_score(y_pred_fin, y_test_fin)

MLPClassifier(activation='tanh', alpha=7.3890560989306495,
              hidden_layer_sizes=(5, 3), max_iter=1000000, random_state=1)

0.8289473684210527

# Create X_design_vec, which contains the median of each respective column
X_design = X_final.copy()
X_design_vec = pd.DataFrame(X_design.median()).transpose()
X_design_vec.head()

names_num = ('sex', 'chest_pain_type', 'max_heart_rate', 'exer_ind_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thallium_scint')

for variable in names_num:
    # Set up a sequence for response variable to plot
    min_res = min(X.loc[:,variable])
    max_res = max(X.loc[:,variable])
    seq = np.linspace(start=min_res,stop=max_res,num=50)

    # Set up a list of moving resultants to plot
    to_predict = []
    for result in seq:
        X_design_vec.loc[0,variable] = result
        to_predict.append(X_design_vec.copy())

    # Convert back to dataframe
    to_predict = pd.concat(to_predict)

    # Scale and predict
    to_predict = scaler.transform(to_predict)
    predictions = reg.predict_proba(to_predict)

    # Plot
    _ = plt.plot(seq,predictions[:,1])
    _ = plt.xlabel(variable)
    _ = plt.ylabel("ca_disease")
    _ = plt.title("ca_disease vs " + variable)
    plt.show();

# False positive Rate and true positive rate
fpr_roc, tpr_roc, thresholds = roc_curve(y_test, y_pred)
plot_roc_curve(fpr = fpr_roc, tpr = tpr_roc)

AUC: 0.8236111111111112

cnf_matrix = confusion_matrix(y_pred, y_test)
plot_confusion_matrix(cnf_matrix, classes=[0,1], normalize=True)

Normalized confusion matrix

accuracy:			0.842  
precision:			0.808 
sensitivity:			0.897

specificity:			0.787 
negative predictive value:	0.884

false positive rate:		0.213  
false negative rate:		0.103 
false discovery rate:		0.192

	age	rest_blood_press	cholesterol	max_heart_rate	st_depression
count	303.000000	303.000000	303.000000	303.000000	303.000000
mean	54.438944	131.689769	246.693069	149.607261	1.039604
std	9.038662	17.599748	51.776918	22.875003	1.161075
min	29.000000	94.000000	126.000000	71.000000	0.000000
25%	48.000000	120.000000	211.000000	133.500000	0.000000
50%	56.000000	130.000000	241.000000	153.000000	0.800000
75%	61.000000	140.000000	275.000000	166.000000	1.600000
max	77.000000	200.000000	564.000000	202.000000	6.200000

	feature_idx	cv_scores	avg_score	feature_names	ci_bound	std_dev	std_err
12	(12,)	[0.7391304347826086, 0.782608695652174, 0.7555...	0.76657	(thallium_scint,)	0.0213739	0.0166296	0.00831479
2	(2,)	[0.782608695652174, 0.782608695652174, 0.82222...	0.757488	(chest_pain_type,)	0.0637724	0.0496171	0.0248085
8	(8,)	[0.7391304347826086, 0.7608695652173914, 0.777...	0.748889	(exer_ind_angina,)	0.0426589	0.0331901	0.016595
7	(7,)	[0.6739130434782609, 0.7391304347826086, 0.844...	0.727053	(max_heart_rate,)	0.0806021	0.0627112	0.0313556
10	(10,)	[0.6304347826086957, 0.7391304347826086, 0.733...	0.709469	(st_slope,)	0.0581748	0.045262	0.022631
11	(11,)	[0.6956521739130435, 0.7391304347826086, 0.666...	0.709179	(num_major_vessels,)	0.0424334	0.0330146	0.0165073
9	(9,)	[0.6304347826086957, 0.6304347826086957, 0.711...	0.696618	(st_depression,)	0.0747359	0.0581471	0.0290735
0	(0,)	[0.6304347826086957, 0.717391304347826, 0.6, 0...	0.651787	(age,)	0.0671384	0.052236	0.026118
1	(1,)	[0.5434782608695652, 0.5652173913043478, 0.577...	0.581739	(sex,)	0.104977	0.0816756	0.0408378
4	(4,)	[0.5217391304347826, 0.5217391304347826, 0.577...	0.57314	(cholesterol,)	0.0681916	0.0530554	0.0265277
6	(6,)	[0.5, 0.6304347826086957, 0.4222222222222222, ...	0.554976	(rest_ecg,)	0.113244	0.088108	0.044054
3	(3,)	[0.6086956521739131, 0.4782608695652174, 0.533...	0.541836	(rest_blood_press,)	0.0540581	0.0420591	0.0210295
5	(5,)	[0.43478260869565216, 0.391304347826087, 0.511...	0.449662	(fasting_blood_sugar,)	0.0505019	0.0392922	0.0196461

	age	sex	chest_pain_type	rest_blood_press	cholesterol	fasting_blood_sugar	rest_ecg	max_heart_rate	exer_ind_angina	st_depression	st_slope	num_major_vessels	thallium_scint	ca_disease
0	63.0	1.0	1.0	145.0	233.0	1.0	2.0	150.0	0.0	2.3	3.0	0.0	6.0	0
1	67.0	1.0	4.0	160.0	286.0	0.0	2.0	108.0	1.0	1.5	2.0	3.0	3.0	2
2	67.0	1.0	4.0	120.0	229.0	0.0	2.0	129.0	1.0	2.6	2.0	2.0	7.0	1
3	37.0	1.0	3.0	130.0	250.0	0.0	0.0	187.0	0.0	3.5	3.0	0.0	3.0	0
4	41.0	0.0	2.0	130.0	204.0	0.0	2.0	172.0	0.0	1.4	1.0	0.0	3.0	0
5	56.0	1.0	2.0	120.0	236.0	0.0	0.0	178.0	0.0	0.8	1.0	0.0	3.0	0
6	62.0	0.0	4.0	140.0	268.0	0.0	2.0	160.0	0.0	3.6	3.0	2.0	3.0	3
7	57.0	0.0	4.0	120.0	354.0	0.0	0.0	163.0	1.0	0.6	1.0	0.0	3.0	0
8	63.0	1.0	4.0	130.0	254.0	0.0	2.0	147.0	0.0	1.4	2.0	1.0	7.0	2
9	53.0	1.0	4.0	140.0	203.0	1.0	2.0	155.0	1.0	3.1	3.0	0.0	7.0	1
10	57.0	1.0	4.0	140.0	192.0	0.0	0.0	148.0	0.0	0.4	2.0	0.0	6.0	0
11	56.0	0.0	2.0	140.0	294.0	0.0	2.0	153.0	0.0	1.3	2.0	0.0	3.0	0
12	56.0	1.0	3.0	130.0	256.0	1.0	2.0	142.0	1.0	0.6	2.0	1.0	6.0	2
13	44.0	1.0	2.0	120.0	263.0	0.0	0.0	173.0	0.0	0.0	1.0	0.0	7.0	0
14	52.0	1.0	3.0	172.0	199.0	1.0	0.0	162.0	0.0	0.5	1.0	0.0	7.0	0
15	57.0	1.0	3.0	150.0	168.0	0.0	0.0	174.0	0.0	1.6	1.0	0.0	3.0	0
16	48.0	1.0	2.0	110.0	229.0	0.0	0.0	168.0	0.0	1.0	3.0	0.0	7.0	1
17	54.0	1.0	4.0	140.0	239.0	0.0	0.0	160.0	0.0	1.2	1.0	0.0	3.0	0
18	48.0	0.0	3.0	130.0	275.0	0.0	0.0	139.0	0.0	0.2	1.0	0.0	3.0	0
19	49.0	1.0	2.0	130.0	266.0	0.0	0.0	171.0	0.0	0.6	1.0	0.0	3.0	0

	age	sex	chest_pain_type	rest_blood_press	cholesterol	fasting_blood_sugar	rest_ecg	max_heart_rate	exer_ind_angina	st_depression	st_slope	num_major_vessels	thallium_scint	ca_disease
166	52.0	1.0	3.0	138.0	223.0	0.0	0.0	169.0	0.0	0.0	1.0	?	3.0	0
192	43.0	1.0	4.0	132.0	247.0	1.0	2.0	143.0	1.0	0.1	2.0	?	7.0	1
287	58.0	1.0	2.0	125.0	220.0	0.0	0.0	144.0	0.0	0.4	2.0	?	7.0	0
302	38.0	1.0	3.0	138.0	175.0	0.0	0.0	173.0	0.0	0.0	1.0	?	3.0	0

	age	sex	chest_pain_type	rest_blood_press	cholesterol	fasting_blood_sugar	rest_ecg	max_heart_rate	exer_ind_angina	st_depression	st_slope	num_major_vessels	thallium_scint	ca_disease
87	53.0	0.0	3.0	128.0	216.0	0.0	2.0	115.0	0.0	0.0	1.0	0.0	?	0
266	52.0	1.0	4.0	128.0	204.0	1.0	0.0	156.0	1.0	1.0	2.0	0.0	?	2