import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")
Load Dataset:
# Read the maternal health risk records from the CSV file in the working directory.
health_df = pd.read_csv(filepath_or_buffer='Maternal Health Risk.csv')
# Preview the first five rows; as a bare expression this is only displayed
# in environments that echo expression values (e.g. a notebook cell).
health_df.head(n=5)
The dataset contains 1014 rows and 7 columns.
EDA:
Risk Level:
# Bar chart of how many records fall into each risk level.
# The column is named by keyword and the frame passed as `data`: seaborn's
# plotting functions accept only `data` positionally, so a bare positional
# Series is not reliably interpreted as the x variable across versions.
sns.countplot(x='RiskLevel', data=health_df);
plt.title("Risk Level Feature", fontdict={'fontsize': 20});

# One sub-frame per risk level, used by the per-class histograms below.
high_risk = health_df[health_df['RiskLevel'] == 'high risk']
low_risk = health_df[health_df['RiskLevel'] == 'low risk']
mid_risk = health_df[health_df['RiskLevel'] == 'mid risk']
Age:
# Per-class histograms: for each feature, draw one figure with three panels
# (high / low / mid risk) that share a y-axis.
panels = (
    (high_risk, "High Risk Level"),
    (low_risk, "Low Risk Level"),
    (mid_risk, "Mid Risk Level"),
)
for feature in ('Age', 'SystolicBP'):
    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
    for axis, (subset, heading) in zip(axes, panels):
        sns.histplot(ax=axis, x=feature, data=subset, color='b')
        axis.set_title(heading)
Split Dataset:
# Separate the target column from the predictor columns.
y = health_df['RiskLevel']
X = health_df.drop(columns='RiskLevel')
Convert the categorical target labels into numeric class ids:
def fun(df):
    """Encode a single risk-level label as an integer class id.

    Despite the parameter name, ``df`` is one label value, not a DataFrame:
    the function is applied element-wise to the target Series.

    Args:
        df: A risk-level label such as ``'high risk'`` or ``'low risk'``.

    Returns:
        0 for ``'high risk'``, 1 for ``'low risk'``, and 2 for any other
        value.
    """
    if df == 'high risk':
        return 0
    if df == 'low risk':
        return 1
    # Catch-all: 'mid risk' lands here, but so does any unrecognised label.
    return 2
# Encode the string risk labels as integer class ids via `fun`.
y = y.apply(fun)

# Hold out 10% of the rows for testing.
# random_state pins the shuffle so the reported scores can be reproduced
# from run to run; stratify=y keeps the class proportions the same in the
# train and test splits, which matters for a test set this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
Scale Data:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
scalar = StandardScaler().fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)
Model Training:
# Candidate classifiers. The dict is keyed by estimator instance with the
# display name as the value; the loops below rely on that shape.
models = {
    SVC(): "Support Vector Machine",
    # Seeded so the forest's bootstrap/feature sampling, and therefore its
    # score, is the same on every run.
    RandomForestClassifier(random_state=42): 'Random Forest',
    LogisticRegression(max_iter=3000): 'Logistic Regression',
}

# Fit every candidate on the training split.
for m in models:
    m.fit(X_train, y_train)

# Report each model's accuracy on the held-out split as a percentage.
for model, name in models.items():
    print(f"Accuracy Score for {name} is : ", model.score(X_test, y_test) * 100, "%")
Model Evaluation:
Classification Report:
Class 0 : High Risk
Class 1 : Low Risk
Class 2 : Mid Risk
# Print per-class precision / recall / F1 for every fitted model.
# `labels` and `target_names` make each report row show the risk level
# instead of the bare integer id (0 = high risk, 1 = low risk, 2 = mid risk).
separator = "----------------------------------------------------------"
for model, name in models.items():
    y_pred = model.predict(X_test)
    print(f"Classification Report for {name}")
    print(separator)
    print(classification_report(
        y_test,
        y_pred,
        labels=[0, 1, 2],
        target_names=['high risk', 'low risk', 'mid risk'],
    ))
    print(separator)
Random Forest achieves higher accuracy on the test data than the other models (Logistic Regression, SVM).
So, we use Random Forest for our prediction.
Thank you :)