0 Breathing Problem
1 Fever
2 Dry Cough
3 Sore throat
4 Running Nose
5 Asthma
6 Chronic Lung Disease
7 Headache
8 Heart Disease
9 Diabetes
10 Hyper Tension
11 Fatigue
12 Gastrointestinal
13 Abroad travel
14 Contact with COVID Patient
15 Attended Large Gathering
16 Visited Public Exposed Places
17 Family working in Public Exposed Places
18 Wearing Masks
19 Sanitization from Market
20 COVID-19: Whether the patient suffers from COVID-19 or not
Each of these columns contains two categories: Yes or No.
import pandas as pd
import numpy as np
import pickle
# data visualization library
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# Global plot styling: apply the seaborn theme first, then override the
# matplotlib defaults (figure size, font size, font family) in a single update.
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})
# Model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
2. Reading Dataset
# Load the data from a CSV file addressed relative to the working directory.
dataset = pd.read_csv('Covid Dataset.csv')
# Preview the first rows; the result is only rendered in an interactive/notebook session.
dataset.head()
3. Data Preprocessing
Checking For Description Of Dataset
# Summary statistics for every column; include='all' also covers the
# non-numeric columns. The result is only rendered in an interactive session.
dataset.describe(include='all')
Shape Of The Dataset
# Print the (rows, columns) tuple of the loaded frame.
print(dataset.shape)
Checking For Unique Value Present In The Columns
# Print the number of distinct values found in each column.
print(dataset.nunique())
Checking For Count Of The Label Column
# Print the per-column non-null counts for the rows labelled 'Yes',
# then for the rows labelled 'No'.
for label in ('Yes', 'No'):
    print(dataset[dataset['COVID-19'] == label].count())
4. Label Encoding
Label encoding is used to convert the non-numerical (categorical) columns into numerical codes.
# Label Encoding
from sklearn.preprocessing import LabelEncoder


def _fit_and_save_encoder(column, pickle_path):
    """Label-encode ``dataset[column]`` in place and pickle the fitted encoder.

    Args:
        column: Name of the column in the module-level ``dataset`` frame.
        pickle_path: File the fitted ``LabelEncoder`` is written to.

    Returns:
        The fitted ``LabelEncoder`` for ``column``.
    """
    encoder = LabelEncoder()
    dataset[column] = encoder.fit_transform(dataset[column])
    # The context manager guarantees the pickle file is flushed and closed.
    with open(pickle_path, 'wb') as fh:
        pickle.dump(encoder, fh)
    return encoder


# Each column is encoded exactly once. Re-fitting an encoder on an already
# encoded column would learn the integer codes instead of the original labels
# and overwrite the saved pickle with an encoder that cannot map the raw
# labels any more.
Breathing_Problem_encoder = _fit_and_save_encoder('Breathing Problem', 'Breathing_Problem_encoder.pkl')
Fever_encoder = _fit_and_save_encoder('Fever', 'Fever_encoder.pkl')
Dry_Cough_encoder = _fit_and_save_encoder('Dry Cough', 'Dry_Cough_encoder.pkl')
Sore_throat_encoder = _fit_and_save_encoder('Sore throat', 'Sore_throat_encoder.pkl')
Running_Nose_encoder = _fit_and_save_encoder('Running Nose', 'Running_Nose_encoder.pkl')
Asthma_encoder = _fit_and_save_encoder('Asthma', 'Asthma_encoder.pkl')
Chronic_Lung_Disease_encoder = _fit_and_save_encoder('Chronic Lung Disease', 'Chronic_Lung_Disease_encoder.pkl')
Headache_encoder = _fit_and_save_encoder('Headache', 'Headache_encoder.pkl')
Heart_Disease_encoder = _fit_and_save_encoder('Heart Disease', 'Heart_Disease_encoder.pkl')
Diabetes_encoder = _fit_and_save_encoder('Diabetes', 'Diabetes_encoder.pkl')
Hyper_Tension_encoder = _fit_and_save_encoder('Hyper Tension', 'Hyper_Tension_encoder.pkl')
Abroad_travel_encoder = _fit_and_save_encoder('Abroad travel', 'Abroad_travel_encoder.pkl')
Contact_with_COVID_Patient_encoder = _fit_and_save_encoder('Contact with COVID Patient', 'Contact_with_COVID_Patient_encoder.pkl')
Attended_Large_Gathering_encoder = _fit_and_save_encoder('Attended Large Gathering', 'Attended_Large_Gathering_encoder.pkl')
Visited_Public_Exposed_Places_encoder = _fit_and_save_encoder('Visited Public Exposed Places', 'Visited_Public_Exposed_Places_encoder.pkl')
Family_working_in_Public_Exposed_Places_encoder = _fit_and_save_encoder('Family working in Public Exposed Places', 'Family_working_in_Public_Exposed_Places_encoder.pkl')
Wearing_Masks_encoder = _fit_and_save_encoder('Wearing Masks', 'Wearing_Masks_encoder.pkl')
Sanitization_from_Market_encoder = _fit_and_save_encoder('Sanitization from Market', 'Sanitization_from_Market_encoder.pkl')
COVID_19_encoder = _fit_and_save_encoder('COVID-19', 'COVID_19_encoder.pkl')
# NOTE: the next two column names carry a trailing space, exactly as they are
# indexed everywhere else in this file.
Gastrointestinal_encoder = _fit_and_save_encoder('Gastrointestinal ', 'Gastrointestinal_encoder.pkl')
Fatigue_encoder = _fit_and_save_encoder('Fatigue ', 'Fatigue_encoder.pkl')
Checking The Information Of The Dataset
# Print the column names, non-null counts and dtypes of the frame.
dataset.info()
5. Checking For Null - Values
# Count the missing values per column; the result is only rendered in an interactive session.
dataset.isnull().sum()
We can see that no null values are present in this dataset; the dataset is clean.
Removing The Columns
# Remove the columns that are not used as model inputs, in one call.
columns_to_remove = [
    'Running Nose',
    'Chronic Lung Disease',
    'Headache',
    'Heart Disease',
    'Diabetes',
    'Gastrointestinal ',
    'Wearing Masks',
    'Sanitization from Market',
    'Asthma',
]
dataset = dataset.drop(columns=columns_to_remove)
dataset.head()
Here we have removed the columns that do not add any value to the predictions.
# Cast every column to the pandas 'category' dtype, then print the frame
# summary so the dtype change can be checked.
dataset = dataset.astype('category')
dataset.info()
Here we have converted the columns to the `category` dtype.
6. Split The Dataset
# Features are every column except the label. Selecting by name rather than by
# position (the first 11 columns) keeps the split correct if the set of dropped
# columns ever changes.
X = dataset.drop(columns=['COVID-19'])
y = dataset['COVID-19']
# A fixed random_state makes the 70/30 split, and therefore the model
# comparison, reproducible. stratify=y keeps the Yes/No ratio the same in the
# training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
7. Standard Scaler
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
Here we have standardized the dataset.
8. Model Building
a) Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression and score it on the held-out split.
l1 = LogisticRegression().fit(X_train, y_train)
l1_predict = l1.predict(X_test)

acc_logreg = accuracy_score(y_test, l1_predict)
cm = confusion_matrix(y_test, l1_predict)
print(cm)
print(classification_report(y_test, l1_predict))
b) Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Train a default decision tree.
d1 = DecisionTreeClassifier().fit(X_train, y_train)

# Show the learned feature importances, first as raw numbers and then as a
# horizontal bar chart labelled with the feature column names.
print(d1.feature_importances_)
feat_importances = pd.Series(d1.feature_importances_, index=X.columns)
feat_importances.nlargest(12).plot.barh()
plt.show()

# Score the tree on the held-out split.
d1_predict = d1.predict(X_test)
acc_decision_tree = accuracy_score(y_test, d1_predict)
cm = confusion_matrix(y_test, d1_predict)
print(cm)
print(classification_report(y_test, d1_predict))
c) Random Forest
from sklearn.ensemble import RandomForestClassifier

# Train a default random forest and score it on the held-out split.
r1 = RandomForestClassifier().fit(X_train, y_train)
r1_predict = r1.predict(X_test)

acc_random_forest = accuracy_score(y_test, r1_predict)
cm = confusion_matrix(y_test, r1_predict)
print(cm)
print(classification_report(y_test, r1_predict))
d) KNN
from sklearn.neighbors import KNeighborsClassifier

# Train a 10-nearest-neighbours classifier and score it on the held-out split.
k1 = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
k1_predict = k1.predict(X_test)

acc_knn = accuracy_score(y_test, k1_predict)
cm = confusion_matrix(y_test, k1_predict)
print(cm)
print(classification_report(y_test, k1_predict))
e) Support Vector Machine
from sklearn.svm import SVC

# Train a default support vector classifier and score it on the held-out split.
# `covid_model` holds the predicted labels for X_test, not the estimator.
model = SVC().fit(X_train, y_train)
covid_model = model.predict(X_test)

acc_svc = accuracy_score(y_test, covid_model)
cm = confusion_matrix(y_test, covid_model)
print(cm)
print(classification_report(y_test, covid_model))
# Collect the test-set accuracy of every classifier into one table.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'KNN', 'Support Vector Machines'],
    'Score': [acc_logreg, acc_decision_tree, acc_random_forest, acc_knn, acc_svc],
})
# sort_values returns a new frame and leaves `models` untouched; print the
# result so the ranking is shown when the file runs as a script.
print(models.sort_values(by='Score', ascending=False))

# Bar chart comparing the accuracy of the classifiers.
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")
ax = sns.barplot(x=models.Model, y=models.Score, palette="rocket", saturation=1.5)
plt.xlabel("Classifier Models", fontsize=20)
plt.ylabel("% of Accuracy", fontsize=20)
plt.title("Accuracy of different Classifier Models", fontsize=20)
plt.xticks(fontsize=12, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)

# Write each bar's accuracy as a percentage just above the bar. The
# coordinates use their own names so the module-level label series `y` is not
# overwritten by the loop.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(
        f'{bar_height:.2%}',
        (bar_x + bar_width / 2, bar_y + bar_height * 1.02),
        ha='center',
        fontsize='x-large',
    )
plt.show()
From this plot we can see that the Support Vector Machine achieved higher accuracy than the other models, so we can use the Support Vector Machine model for prediction.