import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Importing the datasets
train = pd.read_csv("D://final project 4-1//New folder//bm_Train.csv")
test = pd.read_csv("D://final project 4-1//New folder//bm_Test.csv")
# making copies of train and test dataset
train = train.copy()
test = test.copy()
train.head()
train.describe()
test.describe()
# Checking the shape of the training and testing datasets
print(train.shape)
print(test.shape)
# combining the train and test dataset
data = pd.concat([train, test], ignore_index = True)  # reset the index so train and test rows do not share labels
print(data.shape)
# Data Visualization and univariate data analysis
plt.hist(train['Item_Outlet_Sales'], bins = 20, color = 'pink')
plt.title('Distribution of the Target Variable')
plt.xlabel('Item Outlet Sales')
plt.ylabel('Count')
plt.show()
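# A quick, optional check (not part of the original pipeline): sales figures of this kind
# are usually right-skewed, so it can help to compare the histogram above with a
# log-transformed view of the same column.
plt.hist(np.log1p(train['Item_Outlet_Sales']), bins = 20, color = 'pink')
plt.title('Target Variable (log1p transformed)')
plt.xlabel('log(1 + Item Outlet Sales)')
plt.ylabel('Count')
plt.show()
print('Skewness (raw):', train['Item_Outlet_Sales'].skew())
print('Skewness (log1p):', np.log1p(train['Item_Outlet_Sales']).skew())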
# checking the columns of the train set
print(train.columns)
train.dtypes
# checking the different items in Item Identifier
train['Item_Identifier'].value_counts()
# we will analyze only the training set
train['Item_Identifier'].value_counts(normalize = True)
train['Item_Identifier'].value_counts().plot.hist()
plt.title('Distribution of the number of records per Item Identifier')
plt.xlabel('Records per Item Identifier')
plt.ylabel('Number of Item Identifiers')
plt.show()
# checking the different items in Item Fat Content
train['Item_Fat_Content'].value_counts()
# checking different varieties of item fat content
train['Item_Fat_Content'].value_counts(normalize = True)
train['Item_Fat_Content'].value_counts().plot.bar()
plt.title('Different varieties of item fat content in the store')
plt.xlabel('Fat Content')
plt.ylabel('Number of Items')
plt.show()
# checking the different items in Item Type
train['Item_Type'].value_counts()
# we will analyze only the training set
train['Item_Type'].value_counts(normalize = True)
train['Item_Type'].value_counts().plot.bar()
plt.title('Different types of item available in the store')
plt.xlabel('Item Type')
plt.ylabel('Number of Items')
plt.show()
# checking the different types of Outlet Identifier
train['Outlet_Identifier'].value_counts()
# we will analyze only the training set
train['Outlet_Identifier'].value_counts(normalize = True)
train['Outlet_Identifier'].value_counts().plot.bar()
plt.title('Different outlet identifiers in the store')
plt.xlabel('Outlet Identifier')
plt.ylabel('Number of Items')
plt.show()
# checking the different types of Outlet Size
train['Outlet_Size'].value_counts()
# we will analyze only the training set
train['Outlet_Size'].value_counts(normalize = True)
train['Outlet_Size'].value_counts().plot.bar()
plt.title('Different types of outlet sizes in the store')
plt.xlabel('Outlet Size')
plt.ylabel('Number of Items')
plt.show()
# checking different types of items in Outlet Location Type
train['Outlet_Location_Type'].value_counts()
# we will analyze only the training set
train['Outlet_Location_Type'].value_counts(normalize = True)
train['Outlet_Location_Type'].value_counts().plot.bar()
plt.title('Different outlet location types in the store')
plt.xlabel('Outlet Location Type')
plt.ylabel('Number of Items')
plt.show()
# checking different types of item in Outlet Type
train['Outlet_Type'].value_counts()
# we will analyze only the training set
train['Outlet_Type'].value_counts(normalize = True)
train['Outlet_Type'].value_counts().plot.bar()
plt.title('Different outlet types in the store')
plt.xlabel('Outlet Type')
plt.ylabel('Number of Items')
plt.show()
# fat content vs outlet identifier
Item_Fat_Content = pd.crosstab(train['Item_Fat_Content'],train['Outlet_Identifier'])
Item_Fat_Content.div(Item_Fat_Content.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(11, 11))
# fat content vs item type
Item_Type = pd.crosstab(train['Item_Type'], train['Item_Fat_Content'])
Item_Type.div(Item_Type.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True, figsize=(13, 13))
# data pre processing
# checking unique values in the columns of the combined dataset
data.apply(lambda x: len(x.unique()))
data.isnull().sum()
# imputing missing values
data['Item_Weight'] = data['Item_Weight'].replace(0, np.nan)
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())
data['Outlet_Size'] = data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0])
data['Item_Outlet_Sales'] = data['Item_Outlet_Sales'].replace(0, np.nan)
data['Item_Outlet_Sales'] = data['Item_Outlet_Sales'].fillna(data['Item_Outlet_Sales'].mode()[0])
data.isnull().sum()
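# The output section reports the Item_Weight variance before and after imputation; the
# exact code for that comparison is not shown above, but a sketch of it (assuming mean
# versus median imputation on the raw column) could look like this:
weight_raw = pd.concat([train, test])['Item_Weight']
print('Original Weight variable variance', weight_raw.var())
print('Item Weight variance after mean imputation', weight_raw.fillna(weight_raw.mean()).var())
print('Item Weight variance after median imputation', weight_raw.fillna(weight_raw.median()).var())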
# combining 'LF', 'low fat' and 'Low Fat' into a single category, and 'reg' into 'Regular'
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})
data['Item_Fat_Content'].value_counts()
# Getting the first two characters of ID to separate them into different categories
data['Item_Identifier'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Identifier'] = data['Item_Identifier'].map({'FD': 'Food', 'NC': 'Non_Consumable', 'DR': 'Drinks'})
data['Item_Identifier'].value_counts()
# determining the operation period (in years) of each outlet
data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']
data['Outlet_Years'].value_counts()
# removing unnecessary columns from the dataset
#data = data.drop('Item_Identifier', axis = 1)
#print(data.shape)
data['Outlet_Type'].value_counts()
# label encoding (shown for reference only: the encoded frame is not assigned back,
# since one hot encoding is applied to the data below)
from sklearn.preprocessing import LabelEncoder
data.select_dtypes(include = 'object').apply(LabelEncoder().fit_transform)
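# If label encoding were preferred over one hot encoding, each categorical column would
# need to be encoded and assigned back. A sketch of that alternative (data_le is
# illustrative only and is not used further below):
data_le = data.copy()
for col in data_le.select_dtypes(include = 'object').columns:
    data_le[col] = LabelEncoder().fit_transform(data_le[col])
print(data_le.dtypes)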
# one hot encoding
data = pd.get_dummies(data)
print(data.shape)
# splitting the data into dependent and independent variables
x = data.drop('Item_Outlet_Sales', axis = 1)
y = data.Item_Outlet_Sales
print(x.shape)
print(y.shape)
# splitting the dataset into train and test
train = data.iloc[:8523,:]
test = data.iloc[8523:,:]
print(train.shape)
print(test.shape)
# making x_train, x_test, y_train, y_test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
# Modelling
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
model = LinearRegression()
model.fit(x_train, y_train)
# predicting the test set results
y_pred = model.predict(x_test)
print(y_pred)
# finding the mean squared error and variance
mse = mean_squared_error(y_test, y_pred)
print('RMSE :', np.sqrt(mse))
print('Variance score: %.2f' % r2_score(y_test, y_pred))
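# Optional check (not part of the original run): a single 70/30 split gives a somewhat
# noisy RMSE estimate, so 5-fold cross validation on the same x and y can be used to
# confirm the linear regression result.
from sklearn.model_selection import cross_val_score
cv_mse = -cross_val_score(LinearRegression(), x, y, cv = 5, scoring = 'neg_mean_squared_error')
print('Cross validated RMSE :', np.sqrt(cv_mse).mean())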
# AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor
model= AdaBoostRegressor(n_estimators = 100)
model.fit(x_train, y_train)
# predicting the test set results
y_pred = model.predict(x_test)
# RMSE
mse = mean_squared_error(y_test, y_pred)
print("RMSE :", np.sqrt(mse))
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
# predicting the test set results
y_pred = model.predict(x_test)
print(y_pred)
# Calculating the root mean squared error
print("RMSE :", np.sqrt(((y_test - y_pred)**2).sum()/len(y_test)))
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 100 , n_jobs = -1)
model.fit(x_train, y_train)
# predicting the test set results
y_pred = model.predict(x_test)
print(y_pred)
# finding the mean squared error and variance
mse = mean_squared_error(y_test, y_pred)
print("RMSE :",np.sqrt(mse))
print('Variance score: %.2f' % r2_score(y_test, y_pred))
print("Result :",model.score(x_train, y_train))
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(x_train, y_train)
# predicting the test set results
y_pred = model.predict(x_test)
print(y_pred)
print(" RMSE : " , np.sqrt(((y_test - y_pred)**2).sum()/len(y_test)))
# Support vector machine
from sklearn.svm import SVR
model = SVR()
model.fit(x_train, y_train)
# predicting the x test results
y_pred = model.predict(x_test)
# Calculating the RMSE Score
mse = mean_squared_error(y_test, y_pred)
print("RMSE :", np.sqrt(mse))
# Neural Networks
import numpy as np
x_train = np.asmatrix(x_train)
x_test = np.asmatrix(x_test)
y_train = np.asmatrix(y_train).T
y_test = np.asmatrix(y_test).T
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # the network below uses the TensorFlow 1.x graph / session API
D = x_train.shape[1]
# Creating the placeholders for storing the X and Y variables
tf_X = tf.placeholder(tf.float32 , [None,D])
tf_Y = tf.placeholder(tf.float32 , [None,1])
# Layer 1
W1 = tf.Variable(tf.random_normal([D, 20], stddev = 0.01))
b1 = tf.Variable(tf.zeros([20]))
Layer_1 = tf.nn.relu(tf.matmul(tf_X, W1) + b1)
# Layer 2
W2 = tf.Variable(tf.random_normal([20, 15], stddev = 0.01))
b2 = tf.Variable(tf.zeros([15]))
Layer_2 = tf.nn.relu(tf.matmul(Layer_1, W2) + b2)
# Layer 3
W3 = tf.Variable(tf.random_normal([15, 10], stddev = 0.01))
b3 = tf.Variable(tf.zeros([10]))
Layer_3 = tf.nn.relu(tf.matmul(Layer_2, W3) + b3)
# Output layer
W4 = tf.Variable(tf.random_normal([10, 1] , stddev = 0.01))
b4 = tf.Variable(tf.zeros([1]))
output = tf.add(tf.matmul(Layer_3, W4) , b4)
# Defining our cost function which we have to reduce
cost = tf.reduce_mean(tf.square(output - tf_Y))
# Defining the gradient descent optimizer that minimizes the cost
train_op = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ctrain = []
    ctest = []
    for i in range(10000):
        sess.run(train_op, feed_dict = {tf_X: x_train, tf_Y: y_train})
        ctrain.append(sess.run(cost, feed_dict = {tf_X: x_train, tf_Y: y_train}))
        ctest.append(sess.run(cost, feed_dict = {tf_X: x_test, tf_Y: y_test}))
    # RMSE on the held out split is the square root of the final test cost
    print("RMSE :", np.sqrt(ctest[-1]))
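# Optional check (not part of the original run): plotting the training and test cost
# curves collected in ctrain and ctest shows whether the network has converged and
# whether it starts to overfit.
plt.plot(ctrain, label = 'train cost')
plt.plot(ctest, label = 'test cost')
plt.title('Neural network cost during training')
plt.xlabel('Iteration')
plt.ylabel('Mean squared error')
plt.legend()
plt.show()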
OUTPUT:
(8523, 12)     # train.shape
(5681, 11)     # test.shape
(14204, 12)    # data.shape after concatenating train and test
(14204, 47)    # data.shape after one hot encoding
(14204, 46)    # x.shape
(14204,)       # y.shape
(8523, 47)     # train.shape after re-splitting the combined data
(5681, 47)     # test.shape after re-splitting the combined data
Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
'Item_Type', 'Item_MRP', 'Outlet_Identifier',
'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
'Outlet_Type', 'Item_Outlet_Sales'],
dtype='object')
Percentage of missing values in each column:
Item_Identifier 0.000000
Item_Weight 17.165317
Item_Fat_Content 0.000000
Item_Visibility 0.000000
Item_Type 0.000000
Item_MRP 0.000000
Outlet_Identifier 0.000000
Outlet_Establishment_Year 0.000000
Outlet_Size 28.276428
Outlet_Location_Type 0.000000
Outlet_Type 0.000000
Item_Outlet_Sales 0.000000
dtype: float64
Item_Weight mean = 12.857645184135976, median = 12.6
Original Weight variable variance 21.56168825983637
Item Weight variance after mean imputation 17.860121735060453
Item Weight variance after median imputation 17.869561454073366
Categories found in each categorical column:
[array(['DR', 'FD', 'NC'], dtype=object)]
[array(['LF', 'Regular'], dtype=object)]
[array(['Baking Goods', 'Breads', 'Breakfast', 'Canned', 'Dairy',
'Frozen Foods', 'Fruits and Vegetables', 'Hard Drinks',
'Health and Hygiene', 'Household', 'Meat', 'Others', 'Seafood',
'Snack Foods', 'Soft Drinks', 'Starchy Foods'], dtype=object)]
[array(['OUT010', 'OUT013', 'OUT017', 'OUT018', 'OUT019', 'OUT027',
'OUT035', 'OUT045', 'OUT046', 'OUT049'], dtype=object)]
[array(['High', 'Medium', 'Small'], dtype=object)]
[array(['Tier 1', 'Tier 2', 'Tier 3'], dtype=object)]
[array(['Grocery Store', 'Supermarket Type1', 'Supermarket Type2',
'Supermarket Type3'], dtype=object)]
0.5549992903957147
0.5954067732342189
0.59660376323206670
2067.0864
Sales Value is between 1353.13642578125 and 2781.03642578125