import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import csv
# Load the medical insurance dataset from CSV into a pandas DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust it when running elsewhere.
medicalinsurance = pd.read_csv('/content/insurance.csv')
Now let us check the first five rows of the dataset by using the head function.
# Preview the first five records of the dataset
medicalinsurance.head(n=5)
# Dimensions of the dataset as a (rows, columns) tuple
medicalinsurance.shape
The next step is to get some information about the dataset.
# Print a concise summary of the DataFrame (pandas DataFrame.info).
medicalinsurance.info()
In this dataset we have 3 categorical features which are sex, smoker and region.
Now let us check for any missing values present in the dataset.
# Count the missing entries in every column
medicalinsurance.isna().sum()
Let's now analyze the data, starting with the statistical measures of the dataset and then moving on to some plots.
# Descriptive statistics of the dataset's columns (pandas DataFrame.describe).
medicalinsurance.describe()
Now we are going to plot the distribution of the age values.
# Distribution of the 'age' column: density histogram with a KDE curve overlaid.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True and stat='density' is its axes-level replacement (cut=3 keeps
# the KDE tails extending past the data the way distplot drew them).
sns.set()
plt.figure(figsize=(6, 6))
sns.histplot(x=medicalinsurance['age'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Age Distribution')
plt.show()
Now let us do it for the gender column.
# Bar chart with the number of records per value of the 'sex' column
plt.figure(figsize=(6, 6))
sns.countplot(data=medicalinsurance, x='sex')
plt.title('Sex Distribution')
plt.show()
# The same counts as numbers
medicalinsurance['sex'].value_counts()
# Distribution of the 'bmi' column: density histogram with a KDE curve overlaid.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True and stat='density' is its axes-level replacement (cut=3 keeps
# the KDE tails extending past the data the way distplot drew them).
plt.figure(figsize=(6, 6))
sns.histplot(x=medicalinsurance['bmi'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('BMI Distribution')
plt.show()
The normal BMI range for a person lies between 18.5 and 24.9. Looking at the figure above, most people are overweight, which can affect insurance costs.
Now let us do the distribution for the children column.
# Bar chart with the number of records per value of the 'children' column
plt.figure(figsize=(6, 6))
sns.countplot(data=medicalinsurance, x='children')
plt.title('Children')
plt.show()
# The same counts as numbers
medicalinsurance['children'].value_counts()
Now let us do it for the smoker column.
# Bar chart with the number of records per value of the 'smoker' column
plt.figure(figsize=(6, 6))
sns.countplot(data=medicalinsurance, x='smoker')
plt.title('smoker')
plt.show()
# The same counts as numbers
medicalinsurance['smoker'].value_counts()
# Bar chart with the number of records per value of the 'region' column
plt.figure(figsize=(6, 6))
sns.countplot(data=medicalinsurance, x='region')
plt.title('region')
plt.show()
# The same counts as numbers
medicalinsurance['region'].value_counts()
# Distribution of the 'charges' column: density histogram with a KDE curve overlaid.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True and stat='density' is its axes-level replacement (cut=3 keeps
# the KDE tails extending past the data the way distplot drew them).
plt.figure(figsize=(6, 6))
sns.histplot(x=medicalinsurance['charges'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Charges Distribution')
plt.show()
Data Pre-Processing
As we can see, the dataset has 3 categorical columns that are in text form, so let us convert them into numerical values by encoding each category as an integer.
# Encode the three text-valued categorical columns as integer codes.
#
# An explicit per-value lookup is used instead of DataFrame.replace(): turning
# strings into ints with replace() relies on pandas silently downcasting the
# column afterwards, which pandas 2.2 deprecates (FutureWarning). Values that
# are not in a mapping pass through unchanged, just as replace() left them, so
# re-running this cell on already-encoded data is still a no-op.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    # Note the direction: smokers are 0, non-smokers are 1.
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}
for column, codes in category_codes.items():
    # codes=codes binds the current mapping to the lambda explicitly.
    medicalinsurance[column] = medicalinsurance[column].map(
        lambda value, codes=codes: codes.get(value, value)
    )
Let us now split the features and target
# Target: the insurance charges; features: every remaining column.
Y = medicalinsurance['charges']
X = medicalinsurance.drop(columns=['charges'])
print(X)
print(Y)
Splitting the data into Training data & Testing Data
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)
# Shapes of the full, training and test feature sets, in that order
print(*(features.shape for features in (X, X_train, X_test)))
Let us now train the model
# Create a linear regression model and fit it on the training split
regressor = LinearRegression()
regressor.fit(X=X_train, y=Y_train)
Now let us evaluate the model
# Predict on the training data and report the R squared score of those
# predictions against the true training targets.
trainingprediction = regressor.predict(X_train)
rtrain = metrics.r2_score(Y_train, trainingprediction)
# The label names the split so this line can be told apart from the test score
# (it also fixes the "vale" typo in the original message).
print('R squared value (training data) : ', rtrain)
# Predict on the held-out test data and report the R squared score of those
# predictions against the true test targets.
testprediction = regressor.predict(X_test)
rtest = metrics.r2_score(Y_test, testprediction)
# The label names the split so this line can be told apart from the training
# score (it also fixes the "vale" typo in the original message).
print('R squared value (test data) : ', rtest)
Building a Predictive System
# One sample to score. The values must follow the column order of X and use
# the integer codes assigned when the categorical columns were encoded.
# NOTE(review): presumably the order is age, sex, bmi, children, smoker, region
# — confirm against X.columns.
inputdata = [31, 1, 25.74, 0, 1, 0]
# Convert to a NumPy array and reshape to a single row (1 sample, n features)
inputarray = np.asarray(inputdata)
inputreshaped = inputarray.reshape(1, -1)
# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names. Passing the bare array makes scikit-learn warn that X has
# no valid feature names and hides any mismatch in column order or count.
inputframe = pd.DataFrame(inputreshaped, columns=X.columns)
prediction = regressor.predict(inputframe)
print(prediction)
print('The insurance cost is USD', prediction[0])
Now let's save this model using the pickle library.
import pickle

# Serialize the fitted model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises; the original passed a
# bare open() to pickle.dump and never closed it.
with open('/content/model.pkl', 'wb') as modelfile:
    pickle.dump(regressor, modelfile)
Conclusion
In this use case, we explored the basics of the linear regression model, applied it to predict charges, and saw how well the predicted results correlate with the actual results.