import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pen_spark Generate a slider using jupyter widgets search Close
# Load the Diwali sales data set; the file is decoded with 'unicode_escape'.
df = pd.read_csv(
    '/DiwaliSalesData.csv',
    encoding='unicode_escape',
)

# Peek at both ends of the frame.
# NOTE(review): the recorded output only shows the last rows, so these two
# calls presumably share one notebook cell (only the final expression of a
# cell is echoed) — confirm.
df.head()
df.tail()
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount S
Group
11246 1000695 Manning P00296942 M 18-25 19 1 Maharashtra Western Chemical Office 4 370.0
11247 1004089 Reichenbach P00171342 M 26-35 33 0 Haryana Northern Healthcare Veterinary 3 367.0
Madhya
11248 1001209 Oshin P00201342 F 36-45 40 0 Central Textile Office 4 213.0
Pradesh
11249 1004023 Noonan P00059442 M 36-45 37 0 Karnataka Southern Agriculture Office 3 206.0
11250 1002744 Brumley P00281742 F 18-25 19 0 Maharashtra Western Healthcare Office 3 188.0
# Print the schema: column names, non-null counts and dtypes.
# The recorded output shows 15 columns, with Amount missing 12 values and
# Status / unnamed1 holding no values at all.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
13 Status 0 non-null float64
14 unnamed1 0 non-null float64
dtypes: float64(3), int64(4), object(8)
memory usage: 1.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
13 Status 0 non-null float64
14 unnamed1 0 non-null float64
dtypes: float64(3), int64(4), object(8)
memory usage: 1.3+ MB
# Element-wise missing-value mask (True where a cell is null).
df.isnull()
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount Status
Group
0 False False False False False False False False False False False False False True
1 False False False False False False False False False False False False False True
2 False False False False False False False False False False False False False True
3 False False False False False False False False False False False False False True
4 False False False False False False False False False False False False False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11246 False False False False False False False False False False False False False True
11247 False False False False False False False False False False False False False True
11248 False False False False False False False False False False False False False True
11249 False False False False False False False False False False False False False True
11250 False False False False False False False False False False False False False True
# Preview the frame without the two columns that hold no values. The result is
# not assigned, so df itself keeps both columns at this point (the later
# df.isna().sum() output still lists Status and unnamed1).
df.drop(columns=['Status', 'unnamed1'])
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount
Group
0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 23952.
1 1000732 Kartik P00110942 F 26-35 35 1 Andhra Pradesh Southern Govt Auto 3 23934.
2 1001990 Bindu P00118542 F 26-35 35 1 Uttar Pradesh Central Automobile Auto 3 23924.
3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 23912.
Food
4 1000588 Joni P00057942 M 26-35 28 1 Gujarat Western Auto 2 23877.
Processing
... ... ... ... ... ... ... ... ... ... ... ... ... .
11246 1000695 Manning P00296942 M 18-25 19 1 Maharashtra Western Chemical Office 4 370.
11247 1004089 Reichenbach P00171342 M 26-35 33 0 Haryana Northern Healthcare Veterinary 3 367.
Madhya
11248 1001209 Oshin P00201342 F 36-45 40 0 Central Textile Office 4 213.
Pradesh
11249 1004023 Noonan P00059442 M 36-45 37 0 Karnataka Southern Agriculture Office 3 206.
# Median of the Amount column (8109.0 in the recorded output).
df["Amount"].median()
8109.0
# Element-wise missing-value mask again; the recorded output matches the
# earlier df.isnull() call.
df.isna()
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount Status
Group
0 False False False False False False False False False False False False False True
1 False False False False False False False False False False False False False True
2 False False False False False False False False False False False False False True
3 False False False False False False False False False False False False False True
4 False False False False False False False False False False False False False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11246 False False False False False False False False False False False False False True
11247 False False False False False False False False False False False False False True
11248 False False False False False False False False False False False False False True
11249 False False False False False False False False False False False False False True
11250 False False False False False False False False False False False False False True
# Count missing values per column. The recorded output reports 12 in Amount
# and all 11251 rows in both Status and unnamed1.
df.isna().sum()
0
User_ID 0
Cust_name 0
Product_ID 0
Gender 0
Age Group 0
Age 0
Marital_Status 0
State 0
Zone 0
Occupation 0
Product_Category 0
Orders 0
Amount 12
Status 11251
unnamed1 11251
dtype: int64
# Boolean flag per row marking duplicated rows.
df.duplicated()
0 False
1 False
2 False
3 False
4 False
... ...
11246 False
11247 False
11248 False
11249 False
11250 False
11251 rows × 1 columns
dtype: bool
# Report how many rows are flagged as duplicates. print() is used so the count
# is shown even when another statement follows it in the same notebook cell.
print(df.duplicated().sum())

# drop_duplicates() returns a new frame and leaves df untouched unless asked
# to work in place. The original call discarded its result, so df kept all
# 11251 rows even though the de-duplicated frame has 11243. Drop in place so
# the statistics and plots that follow do not count repeated rows.
df.drop_duplicates(inplace=True)
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount
Group
0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 23952.
1 1000732 Kartik P00110942 F 26-35 35 1 Andhra Pradesh Southern Govt Auto 3 23934.
2 1001990 Bindu P00118542 F 26-35 35 1 Uttar Pradesh Central Automobile Auto 3 23924.
3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 23912.
Food
4 1000588 Joni P00057942 M 26-35 28 1 Gujarat Western Auto 2 23877.
Processing
... ... ... ... ... ... ... ... ... ... ... ... ... .
11246 1000695 Manning P00296942 M 18-25 19 1 Maharashtra Western Chemical Office 4 370.
11247 1004089 Reichenbach P00171342 M 26-35 33 0 Haryana Northern Healthcare Veterinary 3 367.
Madhya
11248 1001209 Oshin P00201342 F 36-45 40 0 Central Textile Office 4 213.
Pradesh
11249 1004023 Noonan P00059442 M 36-45 37 0 Karnataka Southern Agriculture Office 3 206.
11250 1002744 Brumley P00281742 F 18-25 19 0 Maharashtra Western Healthcare Office 3 188.
11243 rows × 15 columns
# Re-check the per-row duplicate flags after the drop_duplicates() step above.
df.duplicated()
0 False
1 False
2 False
3 False
4 False
... ...
11246 False
11247 False
11248 False
11249 False
11250 False
11251 rows × 1 columns
# Summary statistics for the numeric columns. Status and unnamed1 show a
# count of 0 in the recorded output because they hold no values.
df.describe()
User_ID Age Marital_Status Orders Amount Status unnamed1
count 1.125100e+04 11251.000000 11251.000000 11251.000000 11239.000000 0.0 0.0
mean 1.003004e+06 35.421207 0.420318 2.489290 9453.610858 NaN NaN
std 1.716125e+03 12.754122 0.493632 1.115047 5222.355869 NaN NaN
min 1.000001e+06 12.000000 0.000000 1.000000 188.000000 NaN NaN
25% 1.001492e+06 27.000000 0.000000 1.500000 5443.000000 NaN NaN
50% 1.003065e+06 33.000000 0.000000 2.000000 8109.000000 NaN NaN
75% 1.004430e+06 43.000000 1.000000 3.000000 12675.000000 NaN NaN
max 1.006040e+06 92.000000 1.000000 4.000000 23952.000000 NaN NaN
# Permanently remove the two columns that hold no values (0 non-null entries
# in the earlier df.info() output), then re-print the schema to confirm.
df.drop(columns=['Status', 'unnamed1'], inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
dtypes: float64(1), int64(4), object(8)
memory usage: 1.1+ MB
# Summary statistics again, now that Status and unnamed1 have been dropped.
df.describe()
User_ID Age Marital_Status Orders Amount
count 1.125100e+04 11251.000000 11251.000000 11251.000000 11239.000000
mean 1.003004e+06 35.421207 0.420318 2.489290 9453.610858
std 1.716125e+03 12.754122 0.493632 1.115047 5222.355869
min 1.000001e+06 12.000000 0.000000 1.000000 188.000000
25% 1.001492e+06 27.000000 0.000000 1.500000 5443.000000
50% 1.003065e+06 33.000000 0.000000 2.000000 8109.000000
75% 1.004430e+06 43.000000 1.000000 3.000000 12675.000000
max 1.006040e+06 92.000000 1.000000 4.000000 23952.000000
Distributions
2-d distributions
Values
# Total purchase amount per gender: bars are aggregated with sum().
# Columns are referenced by name through data=df, like the other plots here.
plt.figure(figsize=(4, 4))
sns.barplot(data=df, x='Gender', y='Amount', estimator=sum)
plt.show()
# Amount by marital status. No estimator is passed, so seaborn's default
# aggregation applies (unlike the gender plot, which sums).
plt.figure(figsize=(4, 4))
sns.barplot(data=df, x='Marital_Status', y='Amount')
plt.show()
# Amount across states on a 10x10 inch canvas.
# NOTE(review): State is a categorical column, so the line joining the
# categories implies an ordering that may not exist — consider a bar chart
# if that is not intended.
plt.figure(figsize=(10, 10))
sns.lineplot(data=df, x='State', y='Amount')
plt.show()
add Code add Text
# Amount across occupations on an 8x8 inch canvas.
# NOTE(review): Occupation is a categorical column, so the line joining the
# categories implies an ordering that may not exist — consider a bar chart
# if that is not intended.
plt.figure(figsize=(8, 8))
sns.lineplot(data=df, x='Occupation', y='Amount')
plt.show()