#ABSTRACT¶
Heart disease is one of the major causes of death in the world, and early detection is essential for efficient treatment and the reduction of its negative effects. We seek to construct a heart disease prediction model using machine learning methods in this research. Machine learning algorithms have demonstrated considerable potential for effectively predicting heart disease. To train and test our model, we use a patient data collection that is openly accessible. We evaluate the performance of various machine learning techniques, such as support vector machines, decision trees, and random forests, and choose the model that performs the best. Our findings demonstrate that the created model is highly accurate and useful for predicting cardiac disease. This project will have significant effects on the healthcare industry and can assist healthcare professionals in providing better care to patients with heart disease.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, neighbors
from sklearn.ensemble import VotingClassifier
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
import io
import random as rnd
import os
# Load the heart-disease dataset (expects heart.csv in the working directory).
df = pd.read_csv('heart.csv')
# Preview the first five rows.
df.head()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52 | 1 | 0 | 125 | 212 | 0 | 1 | 168 | 0 | 1.0 | 2 | 2 | 3 | 0 |
| 1 | 53 | 1 | 0 | 140 | 203 | 1 | 0 | 155 | 1 | 3.1 | 0 | 0 | 3 | 0 |
| 2 | 70 | 1 | 0 | 145 | 174 | 0 | 1 | 125 | 1 | 2.6 | 0 | 0 | 3 | 0 |
| 3 | 61 | 1 | 0 | 148 | 203 | 0 | 1 | 161 | 0 | 0.0 | 2 | 1 | 3 | 0 |
| 4 | 62 | 0 | 0 | 138 | 294 | 1 | 1 | 106 | 0 | 1.9 | 1 | 3 | 2 | 0 |
# Preview the last five rows (the index shows the dataset has 1025 rows).
df.tail()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1020 | 59 | 1 | 1 | 140 | 221 | 0 | 1 | 164 | 1 | 0.0 | 2 | 0 | 2 | 1 |
| 1021 | 60 | 1 | 0 | 125 | 258 | 0 | 0 | 141 | 1 | 2.8 | 1 | 1 | 3 | 0 |
| 1022 | 47 | 1 | 0 | 110 | 275 | 0 | 0 | 118 | 1 | 1.0 | 1 | 1 | 2 | 0 |
| 1023 | 50 | 0 | 0 | 110 | 254 | 0 | 0 | 159 | 0 | 0.0 | 2 | 0 | 2 | 1 |
| 1024 | 54 | 1 | 0 | 120 | 188 | 0 | 1 | 113 | 0 | 1.4 | 1 | 1 | 3 | 0 |
VISUALLY CHECKING IF OUR DATA CONTAINS ANY MISSING VALUES
import missingno as msn
# Bar chart of non-null counts per column — a quick visual missingness check.
msn.bar(df)
# Checking for missing values programmatically; True would mean at least one NaN.
df.isnull().values.any()
False
# Bar chart of the `sex` column, with each bar annotated by its count.
ax = sns.countplot(x="sex", data=df)
for bar in ax.patches:
    bar_height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, bar_height + 0.3, '{:.0f}'.format(bar_height), ha="center")
import plotly.express as px

# Pie chart of the target-class distribution.
# Fix: the counts were previously hard-coded as [508, 409], which sums to 917
# and cannot match the 1025-row dataset; derive them from df instead.
Hdislabel = ['Have heart disease', 'Do not have heart disease']
# NOTE(review): assumes target == 1 means disease present — confirm against
# the dataset's codebook.
val_counts = [int((df['target'] == 1).sum()), int((df['target'] == 0).sum())]
fig = px.pie(values=val_counts, names=Hdislabel,
             color=Hdislabel,
             color_discrete_map={'Have heart disease': 'red',
                                 'Do not have heart disease': '#13F306'},
             title='Heart disease count')
fig.show()
def draw_histograms(dataframe, features, rows, cols):
    """Draw a rows x cols grid of 20-bin histograms, one subplot per feature."""
    figure = plt.figure(figsize=(20, 20))
    for position, column in enumerate(features, start=1):
        axis = figure.add_subplot(rows, cols, position)
        dataframe[column].hist(bins=20, ax=axis, facecolor='ORANGE')
        axis.set_title(column + " Distribution", color='DarkRed')
    figure.tight_layout()
    plt.show()

# One histogram per column of the dataset, laid out on a 6x3 grid.
draw_histograms(df, df.columns, 6, 3)
# Pairwise scatter/histogram grid of every feature combination.
sns.pairplot(data=df)
<seaborn.axisgrid.PairGrid at 0x1778f4490f0>
# Checking for any correlations: pairwise Pearson correlation matrix of all columns.
df.corr()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.103240 | -0.071966 | 0.271121 | 0.219823 | 0.121243 | -0.132696 | -0.390227 | 0.088163 | 0.208137 | -0.169105 | 0.271551 | 0.072297 | -0.229324 |
| sex | -0.103240 | 1.000000 | -0.041119 | -0.078974 | -0.198258 | 0.027200 | -0.055117 | -0.049365 | 0.139157 | 0.084687 | -0.026666 | 0.111729 | 0.198424 | -0.279501 |
| cp | -0.071966 | -0.041119 | 1.000000 | 0.038177 | -0.081641 | 0.079294 | 0.043581 | 0.306839 | -0.401513 | -0.174733 | 0.131633 | -0.176206 | -0.163341 | 0.434854 |
| trestbps | 0.271121 | -0.078974 | 0.038177 | 1.000000 | 0.127977 | 0.181767 | -0.123794 | -0.039264 | 0.061197 | 0.187434 | -0.120445 | 0.104554 | 0.059276 | -0.138772 |
| chol | 0.219823 | -0.198258 | -0.081641 | 0.127977 | 1.000000 | 0.026917 | -0.147410 | -0.021772 | 0.067382 | 0.064880 | -0.014248 | 0.074259 | 0.100244 | -0.099966 |
| fbs | 0.121243 | 0.027200 | 0.079294 | 0.181767 | 0.026917 | 1.000000 | -0.104051 | -0.008866 | 0.049261 | 0.010859 | -0.061902 | 0.137156 | -0.042177 | -0.041164 |
| restecg | -0.132696 | -0.055117 | 0.043581 | -0.123794 | -0.147410 | -0.104051 | 1.000000 | 0.048411 | -0.065606 | -0.050114 | 0.086086 | -0.078072 | -0.020504 | 0.134468 |
| thalach | -0.390227 | -0.049365 | 0.306839 | -0.039264 | -0.021772 | -0.008866 | 0.048411 | 1.000000 | -0.380281 | -0.349796 | 0.395308 | -0.207888 | -0.098068 | 0.422895 |
| exang | 0.088163 | 0.139157 | -0.401513 | 0.061197 | 0.067382 | 0.049261 | -0.065606 | -0.380281 | 1.000000 | 0.310844 | -0.267335 | 0.107849 | 0.197201 | -0.438029 |
| oldpeak | 0.208137 | 0.084687 | -0.174733 | 0.187434 | 0.064880 | 0.010859 | -0.050114 | -0.349796 | 0.310844 | 1.000000 | -0.575189 | 0.221816 | 0.202672 | -0.438441 |
| slope | -0.169105 | -0.026666 | 0.131633 | -0.120445 | -0.014248 | -0.061902 | 0.086086 | 0.395308 | -0.267335 | -0.575189 | 1.000000 | -0.073440 | -0.094090 | 0.345512 |
| ca | 0.271551 | 0.111729 | -0.176206 | 0.104554 | 0.074259 | 0.137156 | -0.078072 | -0.207888 | 0.107849 | 0.221816 | -0.073440 | 1.000000 | 0.149014 | -0.382085 |
| thal | 0.072297 | 0.198424 | -0.163341 | 0.059276 | 0.100244 | -0.042177 | -0.020504 | -0.098068 | 0.197201 | 0.202672 | -0.094090 | 0.149014 | 1.000000 | -0.337838 |
| target | -0.229324 | -0.279501 | 0.434854 | -0.138772 | -0.099966 | -0.041164 | 0.134468 | 0.422895 | -0.438029 | -0.438441 | 0.345512 | -0.382085 | -0.337838 | 1.000000 |
# Annotated heatmap of the correlation matrix computed above.
figure_size = (20, 20)
plt.figure(figsize=figure_size)
sns.heatmap(df.corr(), annot=True)
plt.title('Fig: Annotated values of correlation coefficient of each pair of features', y=-0.23)
Text(0.5, -0.23, 'Fig: Annotated values of correlation coefficient of each pair of features')
# Pairplot coloured by the target class, to eyeball class separability.
p=sns.pairplot(df, hue = 'target')
# Splitting the dataset into training and testing sets.
# NOTE(review): iloc[:, :-2] keeps only the first 12 columns, so it drops the
# 'thal' feature as well as 'target'. If only the label was meant to be
# excluded this should be iloc[:, :-1] — confirm intent. (The later manual-test
# arrays also pass 12 features, so changing this would ripple through them.)
x = df.iloc[:, :-2]
y = df.iloc[:, -1]
# 65/35 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.35)
print("Shape of X before Dimensionality Reduction: ", x_train.shape)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Project the features onto the discriminant axes. With a binary target LDA
# produces at most one component, collapsing the feature matrix to one column.
discriminant = LDA()
x_train = discriminant.fit_transform(x_train, y_train)
x_test = discriminant.transform(x_test)
print("Shape of X after Dimensionality Reduction: ", x_train.shape)
Shape of X before Dimensionality Reduction: (666, 12) Shape of X after Dimensionality Reduction: (666, 1)
# Standardize features: fit the scaler on the training split only, then apply
# the same transformation to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Train the three individual classifiers on the standardized, LDA-reduced data.
svm_model = svm.SVC(kernel='rbf')
knn_model = neighbors.KNeighborsClassifier(n_neighbors=9, p=2, metric='euclidean')
rf_model = RandomForestClassifier(n_estimators=5, random_state=6, max_depth=5)
for classifier in (svm_model, knn_model, rf_model):
    classifier.fit(x_train, y_train)
RandomForestClassifier(max_depth=5, n_estimators=5, random_state=6)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=5, n_estimators=5, random_state=6)
# Make predictions on the test set with each trained classifier.
svm_pred = svm_model.predict(x_test)
knn_pred = knn_model.predict(x_test)
rf_pred = rf_model.predict(x_test)

# Evaluate each model's test-set accuracy and report it.
svm_accuracy = accuracy_score(y_test, svm_pred)
knn_accuracy = accuracy_score(y_test, knn_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
for label, score in (("SVM", svm_accuracy),
                     ("KNN", knn_accuracy),
                     ("Random Forest", rf_accuracy)):
    print(label + " Accuracy:", score)
SVM Accuracy: 0.8746518105849582 KNN Accuracy: 0.8495821727019499 Random Forest Accuracy: 0.8802228412256268
import matplotlib.pyplot as plt

# Model names paired with their test accuracies, in display order.
models = ['KNN', 'Random Forest', 'SVM']
accuracies = [knn_accuracy, rf_accuracy, svm_accuracy]

# Bar chart comparing the three classifiers.
plt.bar(models, accuracies)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Accuracy of Heart Disease Detection Models')
plt.ylim(0.8, 1)  # Adjust the y-axis limits if needed

# Annotate each bar with its accuracy as a percentage.
for position, score in enumerate(accuracies):
    plt.text(position, score, f'{score*100:.2f}%', ha='center', va='bottom')
plt.show()
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
# Calculate the confusion matrix for each model on the held-out test split.
knn_cm = confusion_matrix(y_test, knn_pred)
rf_cm = confusion_matrix(y_test, rf_pred)
svm_cm = confusion_matrix(y_test, svm_pred)
# Function to render a confusion matrix as an annotated image plot.
def print_confusion_matrix(cm, model_name):
    """Display a 2x2 confusion matrix with per-cell counts overlaid."""
    class_names = ['No Heart Disease', 'Heart Disease']
    plt.figure(figsize=(8, 5))
    plt.imshow(cm, interpolation='nearest', cmap="Oranges")  # Orange colormap
    plt.title(f'Confusion Matrix - {model_name}')
    plt.colorbar()
    ticks = np.arange(2)
    plt.xticks(ticks, class_names)
    plt.yticks(ticks, class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    # Overlay each cell count, switching to white text on dark cells for contrast.
    threshold = cm.max() / 2
    for row in range(2):
        for col in range(2):
            cell_colour = 'white' if cm[row, col] > threshold else 'black'
            plt.text(col, row, cm[row, col], ha='center', va='center', color=cell_colour)
    plt.show()
# Render the confusion matrix for each of the three classifiers.
print_confusion_matrix(knn_cm, 'KNN')
print_confusion_matrix(rf_cm, 'Random Forest')
print_confusion_matrix(svm_cm, 'SVM')
PERFORMING K FOLD CROSS VALIDATION TO CHECK OVERFITTING¶
from sklearn.ensemble import GradientBoostingClassifier  # Import the GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score

# 5-fold cross-validation of a gradient-boosting model on the raw feature
# matrix `x` (no scaling or LDA here), to check for overfitting.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
boosted_model = GradientBoostingClassifier()

# Per-fold metric collectors.
accuracies, precisions, aucs = [], [], []

# After the loop, boosted_model retains the fit from the LAST fold only;
# the manual-testing cells below reuse that final fit.
for fold_train_idx, fold_test_idx in kf.split(x):
    fold_x_train, fold_x_test = x.iloc[fold_train_idx], x.iloc[fold_test_idx]
    fold_y_train, fold_y_test = y.iloc[fold_train_idx], y.iloc[fold_test_idx]

    # Fit on this fold's training portion, then predict labels and
    # positive-class probabilities for its held-out portion.
    boosted_model.fit(fold_x_train, fold_y_train)
    fold_pred = boosted_model.predict(fold_x_test)
    fold_proba = boosted_model.predict_proba(fold_x_test)[:, 1]

    # Score the fold and record each metric.
    accuracy = accuracy_score(fold_y_test, fold_pred)
    precision = precision_score(fold_y_test, fold_pred)
    auc = roc_auc_score(fold_y_test, fold_proba)
    accuracies.append(accuracy)
    precisions.append(precision)
    aucs.append(auc)
    print('Accuracy: %.3f, Precision: %.3f, AUC: %.3f' % (accuracy, precision, auc))

# Average each metric across all folds.
mean_accuracy = sum(accuracies) / len(accuracies)
mean_precision = sum(precisions) / len(precisions)
mean_auc = sum(aucs) / len(aucs)
print('Mean accuracy: %.3f, Mean precision: %.3f, Mean AUC: %.3f' % (mean_accuracy, mean_precision, mean_auc))
Accuracy: 0.961, Precision: 0.990, AUC: 0.983 Accuracy: 0.990, Precision: 1.000, AUC: 0.996 Accuracy: 0.976, Precision: 0.974, AUC: 0.998 Accuracy: 0.995, Precision: 1.000, AUC: 0.999 Accuracy: 0.922, Precision: 0.941, AUC: 0.987 Mean accuracy: 0.969, Mean precision: 0.981, Mean AUC: 0.993
# Fold numbers 1..k for the x-axis.
fold_numbers = np.arange(1, len(accuracies) + 1)

# Draw each metric's per-fold trajectory on a shared axis, in the same
# order as before: Accuracy, Precision, AUC.
for metric_values, metric_label in ((accuracies, 'Accuracy'),
                                    (precisions, 'Precision'),
                                    (aucs, 'AUC')):
    plt.plot(fold_numbers, metric_values, label=metric_label)

# Axis labels, title, legend, and display.
plt.xlabel('Fold')
plt.ylabel('Metric Value')
plt.title('K-Fold Cross-Validation Results')
plt.legend()
plt.show()
** MANUAL TESTING DATA OBTAINED FROM AN ACTUAL PATIENT **¶
'''Manual testing of an actual heart disease patient's reports'''
# answer must be 1 (Disease risk detected)
import numpy as np

# One sample with the same 12 features the boosted model was trained on.
myData = np.array([[23, 0, 1, 127, 232, 0, 1, 168, 0, 1, 2, 2]])

# Predict with the (last-fold) fitted gradient-boosting model.
prediction = boosted_model.predict(myData)
message = (
    "Caution! Heart Disease Risk is high, kindly book a checkup with your doctor."
    if prediction[0]
    else "Congratulations! Your heart seems to be in perfect condition."
)
print(message)
Caution! Heart Disease Risk is high, kindly book a checkup with your doctor.
''' Manual testing for a healthy adult '''
# Expected prediction: 0 (no disease risk).
secondData = np.array([[53, 0, 0, 146, 298, 1, 0, 168, 0, 1.8, 2, 3]])
ans = boosted_model.predict(secondData)
if ans:
    print("Caution! Heart Disease Risk is high, kindly book a checkup with your doctor.")
else:
    print("Congratulations! Your heart seems to be in perfect condition.")
Congratulations! Your heart seems to be in perfect condition.
''' Manual testing for a healthy adult '''
# NOTE(review): despite the "healthy adult" label, the recorded output for this
# sample is the high-risk message — either the label or the feature values
# should be double-checked.
secondData = np.array([[53, 0, 1, 136, 304, 1, 1, 172, 0, 0, 2, 0]])
ans = boosted_model.predict(secondData)
if not ans:
    print("Congratulations! Your heart seems to be in perfect condition.")
else:
    print("Caution! Heart Disease Risk is high, kindly book a checkup with your doctor.")
Caution! Heart Disease Risk is high, kindly book a checkup with your doctor.