import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
import joblib
# Importing dataset
# Reads the workbook 'superstore_sales.xlsx' (path is relative to the current
# working directory) into a DataFrame. Every later step in this script refers
# to the frame as `salesdata`, so it is loaded under that name; `df` is kept
# as an alias of the same object for any code that still uses the old name.
salesdata = pd.read_excel('superstore_sales.xlsx')
df = salesdata
DATA AUDIT
You can’t make your data work for you until you know what data you’re talking about.
To get a quick idea of what the data looks like, we can call the head function on the data frame. By default it returns the first five rows, but it accepts an optional argument specifying how many rows to return.
Let us check the first five rows of the dataset using the head function.
# first 5 rows of the dataframe
print(salesdata.head())
# last 5 rows of data -- tail has to be called; the bare attribute
# `salesdata.tail` only evaluates to the bound method, not to any rows
salesdata.tail()
# (number of rows, number of columns) of the dataframe
salesdata.shape
In the next step, we are going to check the columns present in the dataset.
# getting some information about the dataset
# (DataFrame.info prints a summary of the columns: names, non-null counts, dtypes)
salesdata.info()
Now we can do further analysis on our data to answer our questions. Before that, we should see if there are any missing values in our data set. To check for missing values across the entire data set, we use the isnull function and then sum the result to count the missing entries in each column.
A data set with no missing values can be analysed right away, but that is rarely the case: a data scientist will spend much of their time cleaning (or wrangling) the data. While cleaning is not the focus of this post, the missing values in "Item_Weight" and "Outlet_Size" are filled in below before we continue with the analysis.
# checking for missing values: number of null entries in each column
salesdata.isnull().sum()
The isnull check above counts the missing values in each column. Before doing further analysis, we fill in the missing entries so that every column is complete.
Cleaning (or wrangling) the data like this is where a data scientist spends much of their time. Let us start by replacing the missing values in "Item_Weight" with the mean of that column.
# mean value of "Item_Weight" column (missing entries are skipped by mean())
item_weight_mean = salesdata['Item_Weight'].mean()
item_weight_mean
# filling the missing values in "Item_Weight" column with the mean value.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that form is chained
# assignment, which pandas 2.x deprecates and which leaves `salesdata`
# unchanged once copy-on-write is enabled.
salesdata['Item_Weight'] = salesdata['Item_Weight'].fillna(item_weight_mean)
Let us now replace the missing values in "Outlet_Size" with the mode, computed separately for each "Outlet_Type".
# most frequent "Outlet_Size" over the whole column
salesdata['Outlet_Size'].mode()


# most frequent "Outlet_Size" within each "Outlet_Type": the pivot table has
# one column per outlet type holding that type's mode
def _first_mode(values):
    """Return the first entry of the mode of *values*."""
    return values.mode()[0]


outlet_size_mode = salesdata.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=_first_mode,
)
print(outlet_size_mode)
Let us now create another variable, missing_values, that marks which rows have a missing "Outlet_Size".
# boolean mask: True for every row whose "Outlet_Size" is missing
outlet_size_column = salesdata['Outlet_Size']
missing_values = outlet_size_column.isna()
print(missing_values)
Let us now replace the missing values in outlet_size column
# Fill each missing "Outlet_Size" with the mode for that row's "Outlet_Type".
# outlet_size_mode is a one-row pivot table with one column per outlet type,
# so its single row is a Series mapping outlet type -> mode size. Mapping
# through that Series yields one scalar per row; indexing the table inside an
# apply() instead returns a one-element Series per row, which makes apply()
# build a DataFrame and leaves the assignment relying on implicit alignment.
size_by_outlet_type = outlet_size_mode.iloc[0]
salesdata.loc[missing_values, 'Outlet_Size'] = (
    salesdata.loc[missing_values, 'Outlet_Type'].map(size_by_outlet_type)
)
salesdata.head()
Now we need to change 'low fat' and 'LF' to 'Low Fat', and 'reg' to 'Regular'.
# label counts before the alternate spellings are merged
salesdata['Item_Fat_Content'].value_counts()
# alternate spelling -> canonical label for "Item_Fat_Content"
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
salesdata.replace({'Item_Fat_Content': fat_content_aliases}, inplace=True)
Let's check whether the categories have been merged.
# number of rows for each distinct "Item_Fat_Content" label
salesdata['Item_Fat_Content'].value_counts()
Now let us do encoding by changing all categorical values to numerical values
print("before", salesdata.head())

# label encoding: every categorical column is replaced by integer codes, and
# the classes the encoder learned for that column are recorded in `d` under
# the column name so the codes can be traced back to the original labels
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
encoder = LabelEncoder()
d = {}
for column in categorical_columns:
    # the same encoder object is re-fitted for each column in turn
    salesdata[column] = encoder.fit_transform(salesdata[column])
    d[column] = encoder.classes_
Now let us save the classes learned for each encoded column in a file named enc.sav.
# persist the per-column class arrays collected in `d` with joblib
encodings_file = "enc.sav"
joblib.dump(d, encodings_file)
print("after", salesdata.head())
Now let us split the data into target and features by taking two variables, X and Y. X contains all the feature columns and Y contains the target column.
# Target is the "Item_Outlet_Sales" column; features are every other column
Y = salesdata['Item_Outlet_Sales']
X = salesdata.drop(columns=['Item_Outlet_Sales'])
Let us check the data
# feature frame: salesdata without the "Item_Outlet_Sales" column
print(X)
Now check the other variable Y
# target values: the "Item_Outlet_Sales" column
print(Y)
Now let us split the data into training and testing data
# Hold out 20% of the rows as test data; the fixed random_state makes the
# split reproducible from run to run
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts
# shapes of the full, training and test feature sets
print(X.shape, X_train.shape, X_test.shape)
Now let us train our machine learning model and evaluate it.
# model training: an XGBRegressor with its default settings, fitted on the
# training features and training targets
regressor = XGBRegressor()
regressor.fit(X_train, Y_train)
Now the model is built and in the next step let us predict on training data
# predictions for the rows the model was trained on
training_data_prediction = regressor.predict(X_train)
# R squared of those predictions against the true training targets
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
print('R Squared value = ', r2_train)
Let us now predict on the testing data.
# prediction on test data: rows held out by train_test_split, not used in fit
test_data_prediction = regressor.predict(X_test)
# raw predicted values for the test rows
print(test_data_prediction)