# data manipulation
import pickle
import pandas as pd
import numpy as np
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import plotly.graph_objects as go
import plotly.express as px
# Model evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Default plotting theme: seaborn darkgrid plus larger matplotlib figures/fonts.
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})
# Load the heart-failure dataset and run a quick exploratory pass.
train = pd.read_csv('heart_failure_clinical_records_dataset.csv')
# Bare expressions only render inside a notebook; print() so a plain script
# shows them too.
print(train.head(6))
train.info()  # info() writes to stdout itself
# Pie chart of column dtypes (int64 vs float64).
train.dtypes.value_counts().plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True)
plt.title('type of our data')
print(train.describe())
print(train.isnull().sum())  # missing-value count per column
train.hist(figsize=(15, 15), edgecolor='black')
# Class balance of the target variable.
train.DEATH_EVENT.value_counts().plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True)
plt.title('the % of deaths')
plt.figure(figsize=(20, 6))
sns.countplot(x='age', data=train)
plt.xticks(rotation=90)
plt.title('the ages of our patients')  # fixed typo: 'persone'
Distribution Of Age
# Age distribution as a plotly histogram with 2-year-wide bins.
age_trace = go.Histogram(
    x=train['age'],
    xbins=dict(start=40, end=95, size=2),  # bins used for histogram
    marker_color='#e8ab60',
    opacity=1,
)
fig = go.Figure(data=[age_trace])
fig.update_layout(
    title_text='Distribution of Age',
    xaxis_title_text='AGE',
    yaxis_title_text='COUNT',
    bargap=0.05,  # gap between bars of adjacent location coordinates
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='presentation',
)
fig.show()
Distribution of AGE Vs DEATH_EVENT
# Age distribution split by outcome, with a violin marginal to compare shapes.
fig = px.histogram(train, x="age", color="DEATH_EVENT", marginal="violin", hover_data=train.columns,
title ="Distribution of AGE Vs DEATH_EVENT",
labels={"age": "AGE"},
template="plotly",)
fig.show()
# Box plot to eyeball outliers in ejection_fraction.
sns.boxplot(x = train.ejection_fraction, color = 'green')
plt.show()
We can see there are two outlier values. Let's remove them (70 and 80).
# Inspect the outlier rows (ejection_fraction >= 70), then drop them.
# A bare expression has no visible effect in a plain script, so print it.
print(train[train['ejection_fraction'] >= 70])
train = train[train['ejection_fraction'] < 70]
# plotly.graph_objects is already imported as `go` at the top of the file,
# so the redundant re-import was removed.
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=train['ejection_fraction'],
    xbins=dict(start=14, end=80, size=2),  # bins used for histogram
    marker_color='#A7F432',
    opacity=1,
))
fig.update_layout(
    title_text='EJECTION FRACTION DISTRIBUTION',
    xaxis_title_text='EJECTION FRACTION',
    yaxis_title_text='COUNT',
    bargap=0.05,  # gap between bars of adjacent location coordinates
    template='plotly_dark',
)
fig.show()
# Box plot of the follow-up period (time) to check for outliers.
sns.boxplot(x=train.time, color = 'yellow')
plt.show()
No outliers in time
# Box plot of serum_creatinine; the high values are kept (see the note below).
sns.boxplot(x=train.serum_creatinine, color = 'red')
plt.show()
Before dealing with outliers we require knowledge about the outlier, the dataset and possibly some domain knowledge.
Removing outliers without a good reason will not always increase accuracy. Without a deep understanding of what are the possible ranges that
exist within each feature, removing outliers becomes tricky.
When I researched a bit I found that all the values in serum_creatinine fall within the possible range of values, so they are not outliers.
They are genuine data points that help in predicting DEATH_EVENT.
# Correlation heat-map of all features. Styler.set_precision was deprecated in
# pandas 1.3 and removed in 2.0; Styler.format(precision=...) is the supported
# replacement.
train.corr().style.background_gradient(cmap='coolwarm').format(precision=2)
# Feature Selection: rank features with an ExtraTreesClassifier.
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")

# Predictors are every column but the last; the target is the last column.
x = train.iloc[:, :-1]
y = train.iloc[:, -1]

from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(x, y)
print(model.feature_importances_)

# Horizontal bar chart of the twelve most important features.
importance_series = pd.Series(model.feature_importances_, index=x.columns)
importance_series.nlargest(12).plot(kind='barh')
plt.show()
We will select only 3 features : time, ejection_fraction, serum_creatinine
# Keep only the three strongest features (time, ejection_fraction,
# serum_creatinine) plus the DEATH_EVENT target.
train = train.drop(['anaemia', 'creatinine_phosphokinase', 'diabetes', 'high_blood_pressure', 'platelets', 'sex', 'smoking', 'age'], axis=1)
print(train)
# Styler.set_precision was removed in pandas 2.0; format(precision=...) is the
# supported replacement.
train.corr().style.background_gradient(cmap='coolwarm').format(precision=3)
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Separate predictors from the target column.
x = train.drop('DEATH_EVENT', axis=1)
y = train.DEATH_EVENT
print(x.shape)
print(y.shape)

# random_state pins the split so results are reproducible between runs
# (the original call produced a different split on every execution).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print(x_train)
print(y_test)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training fold only, then apply it to the test fold.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Persist the fitted scaler; the with-block guarantees the file handle is
# closed (the original `pickle.dump(sc, open(...))` leaked it).
with open('sc.pkl', 'wb') as f:
    pickle.dump(sc, f)
# Making Confusion Matrix and calculating accuracy score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Baseline model: logistic regression on the scaled features.
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# mylist accumulates every model's accuracy for the final comparison chart.
mylist = []
cm = confusion_matrix(y_test, y_pred)
acc_logreg = accuracy_score(y_test, y_pred)
mylist.append(acc_logreg)
print(cm)
print(acc_logreg)

# Evaluation metrics for the logistic-regression baseline.
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
summary = pd.Series({
    "Accuracy": acc,
    "ROC-AUC": roc_auc,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
})
print(summary.to_string())
# Finding the optimum number of neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Try k = 3..9 and record the test accuracy for each setting.
neighbor_range = range(3, 10)
list1 = []
for neighbors in neighbor_range:
    classifier = KNeighborsClassifier(n_neighbors=neighbors, metric='minkowski')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(list(neighbor_range), list1)
plt.show()
# Training the K Nearest Neighbor Classifier on the Training set
# (k = 5 chosen from the accuracy sweep above).
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(x_test)
print(y_pred)
# Making the confusion matrix and calculating accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
acc_knn = accuracy_score(y_test, y_pred)
mylist.append(acc_knn)  # keep KNN accuracy for the model-comparison chart
print(cm)
print(acc_knn)
# Evaluation metrics
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(pd.Series({"Accuracy": acc,
"ROC-AUC": roc_auc,
"Precision": precision,
"Recall": recall,
"F1-score": f1}).to_string())
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Sweep the regularization strength C and record the test accuracy for each.
c_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
list1 = []
for c in c_values:
    classifier = SVC(C=c, random_state=0, kernel='rbf')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(c_values, list1)
plt.show()
# Training the Support Vector Classifier on the Training set
from sklearn.svm import SVC
# C = 0.7 chosen from the accuracy sweep above.
classifier = SVC(C = 0.7, random_state=0, kernel = 'rbf')
classifier.fit(x_train, y_train)
# Predicting the test set results
y_pred = classifier.predict(x_test)
print(y_pred)
# Making the confusion matrix and calculating accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
acc_svc = accuracy_score(y_test, y_pred)
print(cm)
print(acc_svc)
mylist.append(acc_svc)  # keep SVC accuracy for the model-comparison chart
# Evaluation metrics
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(pd.Series({"Accuracy": acc,
"ROC-AUC": roc_auc,
"Precision": precision,
"Recall": recall,
"F1-score": f1}).to_string())
# Finding the optimum number of max_leaf_nodes
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Grow entropy trees with 2..14 leaf nodes and track the test accuracy.
leaf_range = range(2, 15)
list1 = []
for leaves in leaf_range:
    classifier = DecisionTreeClassifier(max_leaf_nodes=leaves, random_state=0, criterion='entropy')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(list(leaf_range), list1)
plt.show()
# Training the Decision Tree Classifier on the Training set
# (max_leaf_nodes = 10 chosen from the accuracy sweep above).
classifier = DecisionTreeClassifier(max_leaf_nodes = 10, random_state=0, criterion='entropy')
classifier.fit(x_train, y_train)
# Predicting the test set results
y_pred = classifier.predict(x_test)
print(y_pred)
# Making the confusion matrix and calculating accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
acc_decisiontree = accuracy_score(y_test, y_pred)
print(cm)
print(acc_decisiontree)
mylist.append(acc_decisiontree)  # keep DT accuracy for the model-comparison chart
# Evaluation metrics
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(pd.Series({"Accuracy": acc,
"ROC-AUC": roc_auc,
"Precision": precision,
"Recall": recall,
"F1-score": f1}).to_string())
# Finding the optimum number of n_estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Sweep forest sizes 10..29 and record the test accuracy for each.
estimator_range = range(10, 30)
list1 = []
for estimators in estimator_range:
    classifier = RandomForestClassifier(n_estimators=estimators, random_state=0, criterion='entropy')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(list(estimator_range), list1)
plt.show()
# Training the RandomForest Classifier on the Training set
# (n_estimators = 15 chosen from the accuracy sweep above).
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 15, criterion='entropy', random_state=0)
classifier.fit(x_train,y_train)
# Predicting the test set results
y_pred = classifier.predict(x_test)
print(y_pred)
# Making the confusion matrix and calculating the accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
acc_randomforest = accuracy_score(y_test, y_pred)
mylist.append(acc_randomforest)  # keep RF accuracy for the model-comparison chart
print(cm)
print(acc_randomforest)
# Evaluation metrics
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(pd.Series({"Accuracy": acc,
"ROC-AUC": roc_auc,
"Precision": precision,
"Recall": recall,
"F1-score": f1}).to_string())
np.random.seed(0)
import tensorflow as tf

# Seed TensorFlow as well: np.random.seed alone does not make Keras weight
# initialisation or data shuffling reproducible.
tf.random.set_seed(0)

# Initialising the ANN: four hidden ReLU layers of 7 units feeding a single
# sigmoid output for binary classification of DEATH_EVENT.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compiling the ANN
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the ANN on the training set
ann.fit(x_train, y_train, batch_size=16, epochs=100)

# Predicting the test set results: threshold the sigmoid output at 0.5.
y_pred = ann.predict(x_test)
y_pred = (y_pred > 0.5)
np.set_printoptions()

# Making the confusion matrix, calculating accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix")
print(cm)
print()
ac_ann = accuracy_score(y_test, y_pred)
print("Accuracy")
print(ac_ann)
mylist.append(ac_ann)  # keep ANN accuracy for the model-comparison chart

# Evaluation metrics
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(pd.Series({"Accuracy": acc,
                 "ROC-AUC": roc_auc,
                 "Precision": precision,
                 "Recall": recall,
                 "F1-score": f1}).to_string())
Checking For The Accuracy Score
# Compare the models that were actually trained above. The original table also
# listed 'xgboost' and 'catboost' and referenced ac_xgboost / ac_catboost,
# which are never defined anywhere in this script (NameError at runtime), so
# those two rows were removed.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'ANN', 'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_logreg,
              acc_randomforest, ac_ann, acc_decisiontree],
})
# sort_values returns a new DataFrame; print it so the ranking is visible
# when run as a script.
print(models.sort_values(by='Score', ascending=False))
Accuracy Of Different Classifier Models
# Accuracy of the different classifier models as an annotated bar chart.
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")
ax = sns.barplot(x=models.Model, y=models.Score, palette="rocket", saturation=1.5)
plt.xlabel("Classifier Models", fontsize=20)
plt.ylabel("% of Accuracy", fontsize=20)
plt.title("Accuracy of different Classifier Models", fontsize=20)
plt.xticks(fontsize=12, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)
# Label each bar with its accuracy as a percentage, centred just above it.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    ax.annotate(f'{height:.2%}', (x + width / 2, y + height * 1.02),
                ha='center', fontsize='x-large')
plt.show()