import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Useful parameters:
# Shared plot styling: font size for figure titles and the fill colour
# passed to the histograms.
title_size = 25
clr = 'orangered'
# Import data:
# Load the body-performance dataset from the working directory.
csv_path = 'bodyPerformance.csv'
perform_df = pd.read_csv(csv_path)

# Quick inspection: first rows, then the number of missing values per column.
perform_df.head()
perform_df.isnull().sum()
# Data cleaning:
# Columns that are cast to an integer dtype.
chenge_dtypes = ['age', 'diastolic', 'systolic', 'sit-ups counts', 'broad jump_cm']

# Cast all listed columns in one call; the remaining columns keep their dtype.
perform_df = perform_df.astype({column: 'int' for column in chenge_dtypes})
# One sub-frame per performance class label ('A' through 'D').
class_column = perform_df['class']
A_performance = perform_df[class_column == 'A']
B_performance = perform_df[class_column == 'B']
C_performance = perform_df[class_column == 'C']
D_performance = perform_df[class_column == 'D']
# Age histograms, one panel per performance class, on a shared y-axis.
fig, axes = plt.subplots(2, 2, figsize=(18, 10), sharey=True)
age_panels = (
    ((0, 0), A_performance, 'best'),
    ((0, 1), B_performance, 'good'),
    ((1, 0), C_performance, 'average'),
    ((1, 1), D_performance, 'poor'),
)
for (row, col), class_df, health in age_panels:
    sns.histplot(ax=axes[row][col], x='age', data=class_df, color=clr)
    axes[row][col].set_title(f'Age distribution for {health} health')
# 2] Gender
# Gender split per performance class, one pie per class.
#
# Wedge labels and colours are looked up from the value_counts() index, so a
# wedge is always named after the gender it actually represents:
# value_counts() orders by frequency, and that order can differ between classes.
# Values other than 'M'/'F' fall back to their raw value and a neutral colour.
gender_labels = {'M': 'Male', 'F': 'Female'}
gender_colors = {'M': 'deepskyblue', 'F': 'springgreen'}

fig, axes = plt.subplots(2, 2, figsize=(18, 10), sharey=True)
gender_panels = (
    (axes[0][0], A_performance, 'best'),
    (axes[0][1], B_performance, 'good'),
    (axes[1][0], C_performance, 'average'),
    (axes[1][1], D_performance, 'poor'),
)
for panel_ax, class_df, health in gender_panels:
    gender_counts = class_df.gender.value_counts()
    panel_ax.pie(
        gender_counts,
        labels=[gender_labels.get(g, str(g)) for g in gender_counts.index],
        autopct='%.0f%%',
        shadow=True,
        colors=[gender_colors.get(g, 'lightgray') for g in gender_counts.index],
    )
    # Each panel is titled after its own class (best / good / average / poor).
    panel_ax.set_title(f'Gender distribution for {health} health')
# 3] Height
# Height histograms, one panel per performance class, on a shared y-axis.
fig, axes = plt.subplots(2, 2, figsize=(18, 10), sharey=True)
height_panels = (
    ((0, 0), A_performance, 'best'),
    ((0, 1), B_performance, 'good'),
    ((1, 0), C_performance, 'average'),
    ((1, 1), D_performance, 'poor'),
)
for (row, col), class_df, health in height_panels:
    sns.histplot(ax=axes[row][col], x='height_cm', data=class_df, color=clr)
    axes[row][col].set_title(f'Height distribution for {health} health')
# 4] Weight
# Weight histograms, one panel per performance class, on a shared y-axis.
fig, axes = plt.subplots(2, 2, figsize=(18, 10), sharey=True)
weight_panels = (
    ((0, 0), A_performance, 'best'),
    ((0, 1), B_performance, 'good'),
    ((1, 0), C_performance, 'average'),
    ((1, 1), D_performance, 'poor'),
)
for (row, col), class_df, health in weight_panels:
    sns.histplot(ax=axes[row][col], x='weight_kg', data=class_df, color=clr)
    axes[row][col].set_title(f'Weight distribution for {health} health')
# Number of rows in each performance class.
sns.countplot(data=perform_df, x='class')
def fun(df):
    """Encode a single gender label as an integer.

    Args:
        df: One gender value (a scalar, despite the parameter name).

    Returns:
        0 when the value equals 'M', otherwise 1.
    """
    return 0 if df == 'M' else 1
# Replace the gender labels with their integer codes (see `fun`).
encoded_gender = perform_df['gender'].apply(fun)
perform_df['gender'] = encoded_gender
perform_df.head()
def target_fun(df):
    """Encode a single performance-class label as an integer.

    Args:
        df: One class label (a scalar, despite the parameter name).

    Returns:
        0 for 'A', 1 for 'B', 2 for 'C', and 3 for any other value.
    """
    for code, label in enumerate(('A', 'B', 'C')):
        if df == label:
            return code
    # Everything that is not A, B or C is treated as the last class.
    return 3
# Replace the class labels with their integer codes (see `target_fun`).
encoded_class = perform_df['class'].apply(target_fun)
perform_df['class'] = encoded_class
perform_df.head()
# Separate the features from the encoded target column.
X = perform_df.drop('class', axis=1)
y = perform_df['class']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Standardise the features. The scaler must be created before use; its
# statistics are learned from the training split only and then reused
# unchanged on the test split.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
# Model training
# Candidate classifiers: each estimator instance is the key and its display
# name is the value.
models = {}
models[LogisticRegression(max_iter=500)] = 'Logistic Regression'
models[SVC()] = "Support Vector Machine"
models[RandomForestClassifier()] = 'Random Forest'
# Fit every candidate estimator on the training split.
for m in models:
    m.fit(X_train, y_train)

# Report each model's accuracy on the held-out split as a percentage.
for model, name in models.items():
    print(f"Accuracy Score for {name} is : ", model.score(X_test, y_test) * 100, "%")
# Try some other models and a stacking technique
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings: fit on the training split, then score on
# the held-out split.
adaboost = AdaBoostClassifier().fit(X_train, y_train)
adaboost.score(X_test, y_test)
# Output: 0.6047
# Stacked ensemble: five base estimators feed a random-forest final
# estimator, with cv=5 passed to StackingClassifier.
level0 = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('knn', KNeighborsClassifier()),
    ('cart', DecisionTreeClassifier()),
    ('svm', SVC()),
    ('bayes', GaussianNB()),
]
level1 = RandomForestClassifier()

stack_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
stack_model.fit(X_train, y_train)
stack_model.score(X_test, y_test)
# Output: 0.71257
# Random forest with default settings: fit, predict the held-out split, and
# print the per-class precision / recall / F1 report.
rf = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)
# Confusion matrix
# Confusion matrix for the fitted random forest `rf`. Predictions are taken
# from `rf` itself so the plot matches its title, rather than from whichever
# estimator a previous loop variable happens to reference.
y_pred = rf.predict(X_test)

# Encoded targets 0-3 correspond to performance classes A-D (see `target_fun`).
class_names = ['A', 'B', 'C', 'D']

# Pin the label order so rows and columns always line up with class_names,
# even if a class is missing from the test split or the predictions.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))

fig, ax = plt.subplots()
# Index/columns carry the class names, so the heatmap draws them as tick labels.
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    fmt='d',
    ax=ax,
)
ax.xaxis.set_label_position('top')
plt.title('Confusion Matrix for Random Forest', {'fontsize': title_size})
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Lay out after all labels exist so none of them are clipped.
plt.tight_layout()
plt.show()
# Feature importance
# Rank the input columns by the importance scores of the fitted `rf`,
# highest first, and draw them as a horizontal bar chart.
importances = pd.Series(rf.feature_importances_, index=X.columns)
feature = importances.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=feature, y=feature.index)
plt.xlabel('Score')
plt.ylabel('Features')
plt.title("Feature Importance")
plt.show()
# Thank you :)