import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw BigMart data; the path is relative to the working directory.
df1 = pd.read_csv("Datasets/bigmart.csv")
# Preview the first rows to sanity-check the parsed columns.
df1.head()
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility \
0 FDA15 9.30 Low Fat 0.016047
1 DRC01 5.92 Regular 0.019278
2 FDN15 17.50 Low Fat 0.016760
3 FDX07 19.20 Regular 0.000000
4 NCD19 8.93 Low Fat 0.000000
Item_Type Item_MRP Outlet_Identifier \
0 Dairy 249.8092 OUT049
1 Soft Drinks 48.2692 OUT018
2 Meat 141.6180 OUT049
3 Fruits and Vegetables 182.0950 OUT010
4 Household 53.8614 OUT013
Outlet_Establishment_Year Outlet_Size Outlet_Location_Type \
0 1999 Medium Tier 1
1 2009 Medium Tier 3
2 1999 Medium Tier 1
3 1998 NaN Tier 3
4 1987 High Tier 3
Outlet_Type Item_Outlet_Sales
0 Supermarket Type1 3735.1380
1 Supermarket Type2 443.4228
2 Supermarket Type1 2097.2700
3 Grocery Store 732.3800
4 Supermarket Type1 994.7052
# (rows, columns) of the raw frame.
df1.shape
(8523, 12)
# Per-column dtype and non-null count; columns with fewer non-null entries
# than the row count contain missing values.
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Item_Identifier 8523 non-null object
1 Item_Weight 7060 non-null float64
2 Item_Fat_Content 8523 non-null object
3 Item_Visibility 8523 non-null float64
4 Item_Type 8523 non-null object
5 Item_MRP 8523 non-null float64
6 Outlet_Identifier 8523 non-null object
7 Outlet_Establishment_Year 8523 non-null int64
8 Outlet_Size 6113 non-null object
9 Outlet_Location_Type 8523 non-null object
10 Outlet_Type 8523 non-null object
11 Item_Outlet_Sales 8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
# Number of missing values in each column of the raw frame.
df1.isna().sum()
Item_Identifier 0
Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Identifier 0
Outlet_Establishment_Year 0
Outlet_Size 2410
Outlet_Location_Type 0
Outlet_Type 0
Item_Outlet_Sales 0
dtype: int64
# Missing values per column, expressed as a percentage of all rows.
df1.isna().sum() / len(df1) * 100
Item_Identifier 0.000000
Item_Weight 17.165317
Item_Fat_Content 0.000000
Item_Visibility 0.000000
Item_Type 0.000000
Item_MRP 0.000000
Outlet_Identifier 0.000000
Outlet_Establishment_Year 0.000000
Outlet_Size 28.276428
Outlet_Location_Type 0.000000
Outlet_Type 0.000000
Item_Outlet_Sales 0.000000
dtype: float64
# Build the working frame without the identifier / outlet-metadata columns.
# The labels are passed through `columns=` as a plain list: the previous form
# handed `drop` a four-column DataFrame slice, which copies those columns only
# to have their names iterated as labels.
df2 = df1.drop(
    columns=[
        "Item_Identifier",
        "Outlet_Identifier",
        "Outlet_Establishment_Year",
        "Outlet_Type",
    ]
)
df2.head()
Item_Weight Item_Fat_Content Item_Visibility
Item_Type \
0 9.30 Low Fat 0.016047
Dairy
1 5.92 Regular 0.019278 Soft
Drinks
2 17.50 Low Fat 0.016760
Meat
3 19.20 Regular 0.000000 Fruits and
Vegetables
4 8.93 Low Fat 0.000000
Household
Item_MRP Outlet_Size Outlet_Location_Type Item_Outlet_Sales
0 249.8092 Medium Tier 1 3735.1380
1 48.2692 Medium Tier 3 443.4228
2 141.6180 Medium Tier 1 2097.2700
3 182.0950 NaN Tier 3 732.3800
4 53.8614 High Tier 3 994.7052
# Missing-value count per remaining column.
df2.isna().sum()
Item_Weight 1463
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Size 2410
Outlet_Location_Type 0
Item_Outlet_Sales 0
dtype: int64
# Missing-value share per remaining column, in percent of rows.
df2.isna().sum() / len(df2) * 100
Item_Weight 17.165317
Item_Fat_Content 0.000000
Item_Visibility 0.000000
Item_Type 0.000000
Item_MRP 0.000000
Outlet_Size 28.276428
Outlet_Location_Type 0.000000
Item_Outlet_Sales 0.000000
dtype: float64
# Box plot of Item_Weight (still containing NaNs at this point).
# NOTE(review): presumably used to check for outliers before choosing mean
# imputation below — confirm. The meaning of the first positional argument of
# `sns.boxplot` depends on the installed seaborn version; verify.
sns.boxplot(df2["Item_Weight"])
<Axes: >
# Replace missing item weights with the mean of the observed weights.
df2["Item_Weight"] = df2["Item_Weight"].fillna(df2["Item_Weight"].mean())
df2.head()
Item_Weight Item_Fat_Content Item_Visibility
Item_Type \
0 9.30 Low Fat 0.016047
Dairy
1 5.92 Regular 0.019278 Soft
Drinks
2 17.50 Low Fat 0.016760
Meat
3 19.20 Regular 0.000000 Fruits and
Vegetables
4 8.93 Low Fat 0.000000
Household
Item_MRP Outlet_Size Outlet_Location_Type Item_Outlet_Sales
0 249.8092 Medium Tier 1 3735.1380
1 48.2692 Medium Tier 3 443.4228
2 141.6180 Medium Tier 1 2097.2700
3 182.0950 NaN Tier 3 732.3800
4 53.8614 High Tier 3 994.7052
# Re-check missing values after imputing Item_Weight.
df2.isna().sum()
Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Size 2410
Outlet_Location_Type 0
Item_Outlet_Sales 0
dtype: int64
# Frequency of each outlet size (NaN rows are excluded by default); the most
# frequent value is what the mode-based imputation below will fill in.
df2["Outlet_Size"].value_counts()
Outlet_Size
Medium 2793
Small 2388
High 932
Name: count, dtype: int64
# Impute missing outlet sizes with the most frequent category. `mode()`
# returns a Series, so `[0]` takes the first (most frequent) value.
# The assignment is kept on a single logical line: splitting it after `=`
# without a continuation is a syntax error.
df2["Outlet_Size"] = df2["Outlet_Size"].fillna(df2["Outlet_Size"].mode()[0])
df2.head()
Item_Weight Item_Fat_Content Item_Visibility
Item_Type \
0 9.30 Low Fat 0.016047
Dairy
1 5.92 Regular 0.019278 Soft
Drinks
2 17.50 Low Fat 0.016760
Meat
3 19.20 Regular 0.000000 Fruits and
Vegetables
4 8.93 Low Fat 0.000000
Household
Item_MRP Outlet_Size Outlet_Location_Type Item_Outlet_Sales
0 249.8092 Medium Tier 1 3735.1380
1 48.2692 Medium Tier 3 443.4228
2 141.6180 Medium Tier 1 2097.2700
3 182.0950 Medium Tier 3 732.3800
4 53.8614 High Tier 3 994.7052
# Confirm that no missing values remain after both imputations.
df2.isna().sum()
Item_Weight 0
Item_Fat_Content 0
Item_Visibility 0
Item_Type 0
Item_MRP 0
Outlet_Size 0
Outlet_Location_Type 0
Item_Outlet_Sales 0
dtype: int64
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three low-cardinality categorical columns in place.
# LabelEncoder numbers classes in sorted label order, so the codes carry no
# ordinal meaning (e.g. "High" < "Medium" < "Small" for Outlet_Size).
# The same encoder instance is refit for every column, so after this cell
# `le` only remembers the classes of the last column it was fitted on.
le = LabelEncoder()
# NOTE(review): the preview below shows "Low Fat" encoded as 1 rather than 0,
# which suggests the column holds extra spellings of the same categories —
# check with value_counts() and normalise before encoding if so.
df2["Item_Fat_Content"] = le.fit_transform(df2["Item_Fat_Content"])
df2["Outlet_Size"] = le.fit_transform(df2["Outlet_Size"])
# Kept on one logical line; breaking after `=` without a continuation is a
# syntax error.
df2["Outlet_Location_Type"] = le.fit_transform(df2["Outlet_Location_Type"])
df2.head()
Item_Weight Item_Fat_Content Item_Visibility
Item_Type \
0 9.30 1 0.016047
Dairy
1 5.92 2 0.019278 Soft
Drinks
2 17.50 1 0.016760
Meat
3 19.20 2 0.000000 Fruits and
Vegetables
4 8.93 1 0.000000
Household
Item_MRP Outlet_Size Outlet_Location_Type Item_Outlet_Sales
0 249.8092 1 0 3735.1380
1 48.2692 1 2 443.4228
2 141.6180 1 0 2097.2700
3 182.0950 1 2 732.3800
4 53.8614 0 2 994.7052
# Frequency of each Item_Type category; the number of distinct categories
# determines how many bit columns the binary encoding below produces.
df2["Item_Type"].value_counts()
Item_Type
Fruits and Vegetables 1232
Snack Foods 1200
Household 910
Frozen Foods 856
Dairy 682
Canned 649
Baking Goods 648
Health and Hygiene 520
Soft Drinks 445
Meat 425
Breads 251
Hard Drinks 214
Others 169
Starchy Foods 148
Breakfast 110
Seafood 64
Name: count, dtype: int64
import category_encoders as ce

# Binary-encode Item_Type: each category is represented by a short bit
# pattern spread over several 0/1 columns (Item_Type_0, Item_Type_1, ...).
BE = ce.BinaryEncoder(cols=["Item_Type"])
X = BE.fit_transform(df2["Item_Type"])

# Swap the original text column for its encoded bit columns, keeping the
# remaining df2 columns first and the new bit columns at the end.
df3 = pd.concat([df2.drop(columns="Item_Type"), X], axis=1)
df3.head()
Item_Weight Item_Fat_Content Item_Visibility Item_MRP
Outlet_Size \
0 9.30 1 0.016047 249.8092
1
1 5.92 2 0.019278 48.2692
1
2 17.50 1 0.016760 141.6180
1
3 19.20 2 0.000000 182.0950
1
4 8.93 1 0.000000 53.8614
0
Outlet_Location_Type Item_Outlet_Sales Item_Type_0
Item_Type_1 \
0 0 3735.1380 0 0
1 2 443.4228 0 0
2 0 2097.2700 0 0
3 2 732.3800 0 0
4 2 994.7052 0 0
Item_Type_2 Item_Type_3 Item_Type_4
0 0 0 1
1 0 1 0
2 0 1 1
3 1 0 0
4 1 0 1