Dataset Link : https://www.kaggle.com/johnjdavisiv/urinary-biomarkers-for-pancreatic-cancer
The key features are four urinary biomarkers: creatinine, LYVE1, REG1B, and TFF1.
Age and sex, both included in the dataset, may also play a role in who gets pancreatic cancer. The dataset includes a few other biomarkers as well, but these were not measured in all patients (they were collected partly to measure how various blood biomarkers compared to urine biomarkers).
Let's look at the code.
Import Data :
# Load the dataset. The bare expressions in this section (head, shape,
# isnull().sum()) only display their value when run as notebook cells.
cancer_df = pd.read_csv('data.csv')
cancer_df.head()
# Shape of our dataset
cancer_df.shape

# Correlation heatmap of the numeric columns.
# The frame still contains non-numeric columns at this point (e.g. `sex`,
# which is label-encoded further down). DataFrame.corr() raises on such
# columns in pandas >= 2.0, so restrict to numeric dtypes explicitly; on older
# pandas this yields the same matrix that corr() produced by skipping them.
plt.figure(figsize=(8, 7))
sns.heatmap(cancer_df.select_dtypes(include='number').corr(), annot=True)

# Drop the columns that are not used in the rest of the analysis.
cancer_df.drop(
    ['sample_id', 'sample_origin', 'benign_sample_diagnosis', 'stage', 'patient_cohort'],
    axis=1,
    inplace=True,
)
cancer_df.head()
# Number of missing values in each remaining column.
cancer_df.isnull().sum()
# Meaning of the `diagnosis` codes:
# 1 - no pancreatic disease
# 2 - benign
# 3 - pancreatic cancer

# Number of samples per diagnosis class.
plt.figure(figsize=(8, 5))
sns.countplot(x='diagnosis', data=cancer_df, palette='Set2');
plt.title("Countplot for Target variable", {'fontsize': 20});

# Open a new figure before the second plot. Without it, a straight-through run
# draws these bars onto the axes of the plot above and replaces its title.
plt.figure()
# Samples per sex, split by diagnosis class.
sns.countplot(x='sex', data=cancer_df, hue='diagnosis', palette='Set1');
plt.title("Countplot for sex variable based on dignosis", {'fontsize': 20});
# One horizontal boxplot per biomarker on a 2x3 grid. `axes.flat` walks the
# grid row by row, so the order below matches axes[0][0] ... axes[1][2].
fig, axes = plt.subplots(2, 3, figsize=(12, 8), sharey=True)
biomarkers = ["plasma_CA19_9", "creatinine", "LYVE1", "REG1B", "TFF1", "REG1A"]
for ax, biomarker in zip(axes.flat, biomarkers):
    sns.boxplot(ax=ax, x=biomarker, data=cancer_df, palette='dark')

# Open a new figure for the age plot. Without it, a straight-through run draws
# onto the current axes, which is the last subplot of the grid above.
plt.figure()
sns.boxplot(x='age', data=cancer_df, palette='bright');
plt.title("Distibution of feature age", {'fontsize': 20})
# Separate the target column from the predictors.
y = cancer_df['diagnosis']
X = cancer_df.drop(columns='diagnosis')

# Print the distinct values of every object-typed feature column.
for column in X.columns:
    if X[column].dtype == 'O':
        print(f"{column} : ", X[column].unique())

# Replace the `sex` labels with integer codes.
encoder1 = LabelEncoder()
X['sex'] = encoder1.fit_transform(X['sex'])
def fun(df):
    """Map a raw diagnosis code to a zero-based class label.

    Despite its name, ``df`` is a single value (one element of the diagnosis
    column), not a DataFrame.

    Args:
        df: Diagnosis code: 1 (no pancreatic disease), 2 (benign) or
            3 (pancreatic cancer).

    Returns:
        0, 1 or 2 respectively.

    Raises:
        ValueError: If ``df`` is not one of the three known codes. Previously
            every unrecognised value (including NaN) fell through to class 2,
            i.e. was silently labelled as pancreatic cancer.
    """
    zero_based = {1: 0, 2: 1, 3: 2}
    try:
        return zero_based[df]
    except (KeyError, TypeError):
        # KeyError: unknown code or NaN; TypeError: unhashable input.
        raise ValueError(f"unexpected diagnosis code: {df!r}") from None
# Re-label the target element by element with `fun`.
y = y.apply(func=fun)
Split the data into two subsets: training data and testing data :
# Hold out 25% of the samples for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=11)

# Some biomarkers were not measured for every patient, and estimators such as
# SVC and LogisticRegression refuse NaN inputs. Fill the gaps with per-column
# medians computed on the training split only, so no statistic from the test
# split leaks into training.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
print(X_train.shape, X_test.shape)

# Parallel lists, filled as each model is evaluated: display name and test accuracy.
models = []
accuracy = []
Hyperparameter Tuning :
1] Random Forest :
# Grid-search the random-forest hyperparameters with 5-fold cross-validation.
rf = RandomForestClassifier(random_state=43)
param_grid = {
    'n_estimators': [200, 500],
    # 'auto' is omitted: for classifiers it was an alias of 'sqrt', and it is
    # no longer accepted by scikit-learn >= 1.3.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}
CV_rfc = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

# Use the model the search actually selected. GridSearchCV refits the best
# parameter combination on the whole training set by default, so it is ready
# to score. The previous hand-written copy used n_estimators=240, which is not
# in the grid, and dropped the random seed.
new_rf = CV_rfc.best_estimator_
models.append("Random Forest")
accuracy.append(new_rf.score(X_test, y_test))
Classification Report :
# Per-class metrics of the random forest on the held-out samples.
ypred1 = new_rf.predict(X_test)
print("Classification Report for Random Forest")
rf_report = classification_report(y_test, ypred1)
print(rf_report)
2] Support Vector Machine :
# Grid-search C and gamma for an RBF-kernel SVM (verbose=3 logs every fit).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)
grid.best_params_
# Output recorded by the author: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

# Build the final model from the search result rather than from a hand-copied
# set of values, so it stays correct if the data or the grid changes.
s = SVC(**grid.best_params_)
s.fit(X_train, y_train)
models.append("SVM")
accuracy.append(s.score(X_test, y_test))

# Per-class metrics on the held-out samples.
ypred2 = s.predict(X_test)
print("Classification Report SVM")
print(classification_report(y_test, ypred2))
3] Logistic Regression :
# L2-penalised logistic regression with a raised iteration cap.
logreg = LogisticRegression(penalty='l2', max_iter=3500)
logreg.fit(X_train, y_train)

# Record the test accuracy alongside the model's display name.
logreg_accuracy = logreg.score(X_test, y_test)
models.append("Logistic Regression")
accuracy.append(abs(logreg_accuracy))

# Per-class metrics on the held-out samples.
ypred3 = logreg.predict(X_test)
print("Classification Report Logistic Regression")
logreg_report = classification_report(y_test, ypred3)
print(logreg_report)
4] Decision Tree :
# Decision tree with scikit-learn's default settings.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Record the test accuracy alongside the model's display name.
dt_accuracy = dt.score(X_test, y_test)
models.append("Decision Tree")
accuracy.append(abs(dt_accuracy))

# Per-class metrics on the held-out samples.
ypred4 = dt.predict(X_test)
print("Classification Report Dicision Tree")
dt_report = classification_report(y_test, ypred4)
print(dt_report)
Accuracy of Models :
# Bar chart comparing the recorded test accuracy of each model, one colour per bar.
bar_colors = ['red', 'deepskyblue', 'orange', 'blue']
plt.figure(figsize=(8, 6))
plt.bar(models, accuracy, color=bar_colors);
plt.title("Accuracy of models", fontdict={'fontsize': 20})
Thank you :)