Data Analysis and Visualization MPA-2
NITHIN RAJ
KISHORE KUMAR M
VISHNU VARADHAN REDDY
Define the necessary libraries (1 mark)
In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn import metrics as mt
from sklearn.preprocessing import OrdinalEncoder,StandardScaler,MinMaxScaler,MaxAbsScaler,MaxAbsScaler,RobustScaler,No
from sklearn.model_selection import train_test_split
Load the dataset into the dataframe (1 mark)
In [2]: df = pd.read_csv('BigmartSales.csv')
Drop the "Item_Identifier" and "Outlet_Identifier" columns (1 mark)
In [3]: # df.drop() drops the data in the dataframe-df.
# By Default it drops the row data.
# To drop the column data set the axis=1
In [4]: print('Columns in the dataset before dropping are: ',df.columns)
df = df.drop(['Item_Identifier','Outlet_Identifier'],axis=1)
print('Columns in the dataset after dropping are: ',df.columns)
Columns in the dataset before dropping are: Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visib
ility',
'Item_Type', 'Item_MRP', 'Outlet_Identifier',
'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
'Outlet_Type', 'Item_Outlet_Sales'],
dtype='object')
Columns in the dataset after dropping are: Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales'],
dtype='object')
Extract the target labels (1 mark)
In [5]: # The target label in a dataset is the output data.
# It changes based on the other features in the dataset
In [6]: target_label = df.Item_Outlet_Sales
target_label
Out[6]: 0 3735.1380
1 443.4228
2 2097.2700
3 732.3800
4 994.7052
...
8518 2778.3834
8519 549.2850
8520 1193.1136
8521 1845.5976
8522 765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64
Replace the field "Item_Fat_Content" with numerical value (1 mark)
In [7]: # df.replace({'old_data':'new_data'}) replaces the old_data in a dataframe with the new_data provided as the key-value
In [8]: print('Before Replacing: ',df['Item_Fat_Content'].unique())
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'Low Fat':0,'LF':0,'Regular':1,'reg':1,'low fat':0})
print('After Replacing: ',df['Item_Fat_Content'].unique())
Before Replacing: ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
After Replacing: [0 1]
Perform ordinal encoding of the "Item_Type", "Outlet_Type", "Outlet_Location_Type" and "Outlet_Type" field (1 mark)
In [9]: # Encoding is the process of transforming the categorical (discrete) features into ordinal integers.
# This is the preprocessing step to be done before using the dataset for ML model training
In [10]: ordEnc = OrdinalEncoder()
df['Item_Type'] = ordEnc.fit_transform(df['Item_Type'].values.reshape(-1, 1))
df['Item_Type']
Out[10]: 0 4.0
1 14.0
2 10.0
3 6.0
4 9.0
...
8518 13.0
8519 0.0
8520 8.0
8521 13.0
8522 14.0
Name: Item_Type, Length: 8523, dtype: float64
In [11]: df['Outlet_Type'] = ordEnc.fit_transform(df['Outlet_Type'].values.reshape(-1, 1))
df['Outlet_Type']
Out[11]: 0 1.0
1 2.0
2 1.0
3 0.0
4 1.0
...
8518 1.0
8519 1.0
8520 1.0
8521 2.0
8522 1.0
Name: Outlet_Type, Length: 8523, dtype: float64
In [12]: df['Outlet_Location_Type'] = ordEnc.fit_transform(df['Outlet_Location_Type'].values.reshape(-1, 1))
df['Outlet_Location_Type']
Out[12]: 0 0.0
1 2.0
2 0.0
3 2.0
4 2.0
...
8518 2.0
8519 1.0
8520 1.0
8521 2.0
8522 0.0
Name: Outlet_Location_Type, Length: 8523, dtype: float64
In [13]: df.isna().sum()
Out[13]: Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Establishment_Year 0
Outlet_Size 2410
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
Imputation of "Outlet_Size" field with mode value (1 mark)
In [14]: # fillna() is the method used to place the custom values at the NaN in a dataframe of series
In [15]: print('The Mode of Outlet Size is: ',df['Outlet_Size'].mode())
The Mode of Outlet Size is: 0 Medium
Name: Outlet_Size, dtype: object
In [16]: df['Outlet_Size'] = df['Outlet_Size'].fillna('Medium')
In [17]: df.isna().sum()
Out[17]: Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Establishment_Year 0
Outlet_Size 0
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
Check for null values (1 mark)
In [18]: df.isnull().sum()
Out[18]: Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Establishment_Year 0
Outlet_Size 0
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
Imputation of "Item_Weight" field with mode value (1 mark)
In [19]: print('The Mode of Item Weight is: ',df['Item_Weight'].mode())
The Mode of Item Weight is: 0 12.15
Name: Item_Weight, dtype: float64
In [20]: df['Item_Weight'] = df['Item_Weight'].fillna(12.15)
In [21]: df.isna().sum()
Out[21]: Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Establishment_Year 0
Outlet_Size 0
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
Display all field in the dataset using boxplot (1 mark)
In [22]: # Box plot is used to find the outliers present in a data set. Mostly used for a univariate analysis.
# Also can be applied to bivariate analysis having 1 numerical and 1 categorical data
# It is called as grouped boxplot
In [23]: plt.figure(figsize=(10,5))
sns.boxplot(df)
plt.xticks(rotation=90)
plt.title('Bigmart Sales Data')
plt.show()
Split the dataset into train and test(20%), apply Linear Regression and calculate RMSE value (1 mark)
In [24]: # train_test_split is the method in sklearn.model_selection
# It is used to create the training and testing data from a complete data
# It gets the parameters - input data, output data,
# test_size=the size of the data that has to be selected for the testing of the ML model
# it returns four values - xtrain,xtest,ytrain,ytest that are given to the ML model for training and testing
In [25]: df['Outlet_Size'] = ordEnc.fit_transform(df['Outlet_Size'].values.reshape(-1, 1))
X=df.drop('Item_Outlet_Sales',axis=1)
Y=df['Item_Outlet_Sales']
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=0.2,)
# Create and fit the linear regression model
model = LinearRegression()
model.fit(xtrain, ytrain)
# Make predictions on the test set
ypred = model.predict(xtest)
# Calculate RMSE
rmse1 = math.sqrt(mt.mean_squared_error(ytest, ypred))
print(f"Root Mean Squared Error (RMSE): {rmse1}")
Root Mean Squared Error (RMSE): 1177.361349688933
Apply StandardScaller and split the dataset into train and test(20%) (1 mark)
In [26]: # StandardScaler standardize features by removing the mean and scaling to unit variance.
# Standardization of a dataset is a common requirement for many machine learning estimators:
# they might behave badly if the individual features do not more or less look like standard normally distributed data
In [27]: sc = StandardScaler()
df_sc = sc.fit_transform(df)
df1 = pd.DataFrame(df_sc)
df1.columns=['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
X1=df1.drop('Item_Outlet_Sales',axis=1)
Y1=df1['Item_Outlet_Sales']
x1train,x1test,y1train,y1test=train_test_split(X1,Y,test_size=0.2)
# Create and fit the linear regression model
model1 = LinearRegression()
model1.fit(x1train, y1train)
Out[27]: LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Display all field in the dataset using boxplot (1 mark)
In [28]: plt.figure(figsize=(10,5))
sns.boxplot(df1)
plt.xticks(rotation=90)
plt.title('Bigmart Sales Data')
plt.show()
Apply Linear Regression and calculate RMSE value (1 mark)
In [29]: # Make predictions on the test set
y1pred = model1.predict(x1test)
# Calculate RMSE
rmse2 = math.sqrt(mt.mean_squared_error(y1test, y1pred))
print(f"Root Mean Squared Error (RMSE): {rmse2}")
Root Mean Squared Error (RMSE): 1161.6406081768139
Apply MinMaxScaler, split the dataset into train and test(20%), apply LinearRegression and calculate RMSE (1 mark)
In [30]: # MinMaxScaler Transform features by scaling each feature to a given range.
# This estimator scales and translates each feature individually such that
# it is in the given range on the training set, e.g. between zero and one.
# This transformation is often used as an alternative to zero mean, unit variance scaling.
In [31]: mmsc = MinMaxScaler()
df_mmsc = mmsc.fit_transform(df)
df2 = pd.DataFrame(df_mmsc)
df2.columns=['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
X2=df2.drop('Item_Outlet_Sales',axis=1)
Y2=df2['Item_Outlet_Sales']
x2train,x2test,y2train,y2test=train_test_split(X2,Y,test_size=0.2)
# Create and fit the linear regression model
model2 = LinearRegression()
model2.fit(x2train, y2train)
# Make predictions on the test set
y2pred = model2.predict(x2test)
# Calculate RMSE
rmse3 = math.sqrt(mt.mean_squared_error(y2test, y2pred))
print(f"Root Mean Squared Error (RMSE): {rmse3}")
Root Mean Squared Error (RMSE): 1176.5289257439433
Apply RobustScaler,Split the dataset into train and test(20%), apply LinearRegression and calculate RMSE (1 mark)
In [32]: # RobustScaler scales features using statistics that are robust to outliers.
# This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile R
# The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).
In [33]: rsc = RobustScaler()
df_rsc = rsc.fit_transform(df)
dfr = pd.DataFrame(df_rsc)
dfr.columns=['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
Xr=dfr.drop('Item_Outlet_Sales',axis=1)
Yr=dfr['Item_Outlet_Sales']
xrtrain,xrtest,yrtrain,yrtest=train_test_split(Xr,Y,test_size=0.2)
# Create and fit the linear regression model
modelr = LinearRegression()
modelr.fit(xrtrain, yrtrain)
# Make predictions on the test set
yrpred = modelr.predict(xrtest)
# Calculate RMSE
rmse4 = math.sqrt(mt.mean_squared_error(yrtest, yrpred))
print(f"Root Mean Squared Error (RMSE): {rmse4}")
Root Mean Squared Error (RMSE): 1143.8487793222237
Apply MaxAbsScaler, split the dataset into train and test(20%), apply LinearRegression and calculate RMSE (1 mark)
In [34]: # MaxAbsScaler scales each feature by its maximum absolute value.
# This estimator scales and translates each feature individually such that
# the maximal absolute value of each feature in the training set will be 1.0.
# It does not shift/center the data, and thus does not destroy any sparsity.
# MaxAbsScaler doesn’t reduce the effect of outliers; it only linearily scales them down.
In [35]: masc = MaxAbsScaler()
df_masc = masc.fit_transform(df)
dfa = pd.DataFrame(df_masc)
dfa.columns=['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
Xa=dfa.drop('Item_Outlet_Sales',axis=1)
Ya=dfa['Item_Outlet_Sales']
xatrain,xatest,yatrain,yatest=train_test_split(Xa,Y,test_size=0.2)
# Create and fit the linear regression model
modela = LinearRegression()
modela.fit(xatrain, yatrain)
# Make predictions on the test set
yapred = modela.predict(xatest)
# Calculate RMSE
rmse5 = math.sqrt(mt.mean_squared_error(yatest, yapred))
print(f"Root Mean Squared Error (RMSE): {rmse5}")
Root Mean Squared Error (RMSE): 1195.9232136536114
Apply Normalizer, split the dataset into train and test(20%), apply LinearRegression and calculate RMSE (1 mark)
In [36]: # Normalizer normalizes samples individually to unit norm.
# Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of oth
# so that its norm (l1, l2 or inf) equals one.
In [37]: nsc = Normalizer()
df_nsc = masc.fit_transform(df)
dfn = pd.DataFrame(df_nsc)
dfn.columns=['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size',
'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
Xn=dfn.drop('Item_Outlet_Sales',axis=1)
Yn=dfn['Item_Outlet_Sales']
xntrain,xntest,yntrain,yntest=train_test_split(Xn,Y,test_size=0.2)
# Create and fit the linear regression model
modeln = LinearRegression()
modeln.fit(xntrain, yntrain)
# Make predictions on the test set
ynpred = modeln.predict(xntest)
# Calculate RMSE
rmse6 = math.sqrt(mt.mean_squared_error(yntest, ynpred))
print(f"Root Mean Squared Error (RMSE): {rmse6}")
Root Mean Squared Error (RMSE): 1218.0003678085768
Define a function valuelabel to place the legend of each bar in the histogram (1 mark)
In [38]: def valuelabel(ax, spacing=3):
# For each bar: Place a label
for rect in ax.patches:
# Get X and Y placement of label from rect.
y_value = rect.get_height()
x_value = rect.get_x() + rect.get_width() / 2
# Number of points between bar and label
space = spacing
# Vertical alignment for positive values
va = 'bottom'
# If value of bar is negative: Place label below bar
if y_value < 0:
# Invert space to place label below
space *= -1
# Vertically align label at top
va = 'top'
# Use Y value as label and format number with one decimal place
label = "{:.1f}".format(y_value)
# Create annotation
ax.annotate(
label, # Use `label` as label
(x_value, y_value), # Place label at end of the bar
xytext=(0, space), # Vertically shift label by `space`
textcoords="offset points", # Interpret `xytext` as offset in points
ha='center', # Horizontally center label
va=va) # Vertically align label differently for
# positive and negative values.
Plot a histogram to display the RMSE value of each scaler (1 mark)
In [39]: rmses = [rmse1,rmse2,rmse3,rmse4,rmse5,rmse6]
rmse_Series = pd.Series(rmses)
labels = ['rmse1','rmse2','rmse3','rmse4','rmse5','rmse6']
# Creating histogram
plt.figure(figsize=(10,5))
ax = rmse_Series.plot(kind='bar')
ax.set_xticklabels(labels)
valuelabel(ax)
# Show plot
plt.show()