#WEEK4
#NAME: V.Vyaswanth
#Roll No : 23071A66K4
#23071A66K2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Auto MPG CSV into the frame used by the outlier-removal section.
df = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
# Preview the first five rows.
df.head(n=5)
{"summary":"{\n \"name\": \"df\",\n \"rows\": 398,\n \"fields\": [\
n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 115,\n \"min\": 0,\n
\"max\": 397,\n \"num_unique_values\": 398,\n
\"samples\": [\n 198,\n 396,\n 33\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"mpg\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.815984312565782,\n \"min\": 9.0,\n \"max\":
46.6,\n \"num_unique_values\": 129,\n \"samples\": [\n
17.7,\n 30.5,\n 30.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"cylinders\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
1,\n \"min\": 3,\n \"max\": 8,\n
\"num_unique_values\": 5,\n \"samples\": [\n 4,\n
5,\n 6\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"displacement\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 104.26983817119581,\n \"min\":
68.0,\n \"max\": 455.0,\n \"num_unique_values\": 82,\n
\"samples\": [\n 122.0,\n 307.0,\n 360.0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"horsepower\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 94,\n \"samples\": [\n
\"112\",\n \"?\",\n \"78\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"weight\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 846,\n
\"min\": 1613,\n \"max\": 5140,\n \"num_unique_values\":
351,\n \"samples\": [\n 3730,\n 1995,\n
2215\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"acceleration\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 2.497555013249332,\n \"min\":
9.0,\n \"max\": 22.1,\n \"num_unique_values\": 89,\n
\"samples\": [\n 16.7,\n 15.8,\n 12.8\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"model year\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
3,\n \"min\": 70,\n \"max\": 82,\n
\"num_unique_values\": 13,\n \"samples\": [\n 81,\n
79,\n 70\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"origin\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 3,\n
\"num_unique_values\": 3,\n \"samples\": [\n 1,\n
3,\n 2\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"car
name\",\n \"properties\": {\n \"dtype\": \"string\",\n
\"num_unique_values\": 305,\n \"samples\": [\n \"mazda
rx-4\",\n \"ford f108\",\n \"buick century luxus
(sw)\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}
# Horizontal box plot of mpg to eyeball potential outliers.
sns.boxplot(df.mpg, orient="h")
<Axes: xlabel='mpg'>
1. Removing outliers / missing values
# Rows whose horsepower is the "?" placeholder rather than a number.
baddata = df.loc[df["horsepower"].eq("?")]
baddata
{"summary":"{\n \"name\": \"baddata\",\n \"rows\": 6,\n \"fields\":
[\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 143,\n \"min\": 32,\n
\"max\": 374,\n \"num_unique_values\": 6,\n \"samples\":
[\n 32,\n 126,\n 374\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"mpg\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.886951248739908,\n
\"min\": 21.0,\n \"max\": 40.9,\n \"num_unique_values\":
6,\n \"samples\": [\n 25.0,\n 21.0,\n
23.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"cylinders\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 0,\n \"min\": 4,\n
\"max\": 6,\n \"num_unique_values\": 2,\n \"samples\":
[\n 6,\n 4\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"displacement\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 43.32204981299938,\n
\"min\": 85.0,\n \"max\": 200.0,\n
\"num_unique_values\": 6,\n \"samples\": [\n 98.0,\n
200.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"horsepower\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 1,\n \"samples\":
[\n \"?\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"weight\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 504,\n \"min\": 1835,\n \"max\": 3035,\n
\"num_unique_values\": 6,\n \"samples\": [\n 2046\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"acceleration\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
2.212163345385387,\n \"min\": 14.3,\n \"max\": 20.5,\n
\"num_unique_values\": 6,\n \"samples\": [\n 19.0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"model year\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
4,\n \"min\": 71,\n \"max\": 82,\n
\"num_unique_values\": 5,\n \"samples\": [\n 74\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"origin\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 0,\n
\"min\": 1,\n \"max\": 2,\n \"num_unique_values\": 2,\n
\"samples\": [\n 2\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"car name\",\n \"properties\": {\n \"dtype\":
\"string\",\n \"num_unique_values\": 6,\n \"samples\":
[\n \"ford pinto\"\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"baddata"}
# Dump the raw horsepower column as an array to inspect its entries.
arr = df.horsepower.values
print(arr)
['130' '165' '150' '150' '140' '198' '220' '215' '225' '190' '170'
'160'
'150' '225' '95' '95' '97' '85' '88' '46' '87' '90' '95' '113' '90'
'215'
'200' '210' '193' '88' '90' '95' '?' '100' '105' '100' '88' '100'
'165'
'175' '153' '150' '180' '170' '175' '110' '72' '100' '88' '86' '90'
'70'
'76' '65' '69' '60' '70' '95' '80' '54' '90' '86' '165' '175' '150'
'153'
'150' '208' '155' '160' '190' '97' '150' '130' '140' '150' '112' '76'
'87' '69' '86' '92' '97' '80' '88' '175' '150' '145' '137' '150'
'198'
'150' '158' '150' '215' '225' '175' '105' '100' '100' '88' '95' '46'
'150' '167' '170' '180' '100' '88' '72' '94' '90' '85' '107' '90'
'145'
'230' '49' '75' '91' '112' '150' '110' '122' '180' '95' '?' '100'
'100'
'67' '80' '65' '75' '100' '110' '105' '140' '150' '150' '140' '150'
'83'
'67' '78' '52' '61' '75' '75' '75' '97' '93' '67' '95' '105' '72'
'72'
'170' '145' '150' '148' '110' '105' '110' '95' '110' '110' '129' '75'
'83' '100' '78' '96' '71' '97' '97' '70' '90' '95' '88' '98' '115'
'53'
'86' '81' '92' '79' '83' '140' '150' '120' '152' '100' '105' '81'
'90'
'52' '60' '70' '53' '100' '78' '110' '95' '71' '70' '75' '72' '102'
'150'
'88' '108' '120' '180' '145' '130' '150' '68' '80' '58' '96' '70'
'145'
'110' '145' '130' '110' '105' '100' '98' '180' '170' '190' '149' '78'
'88' '75' '89' '63' '83' '67' '78' '97' '110' '110' '48' '66' '52'
'70'
'60' '110' '140' '139' '105' '95' '85' '88' '100' '90' '105' '85'
'110'
'120' '145' '165' '139' '140' '68' '95' '97' '75' '95' '105' '85'
'97'
'103' '125' '115' '133' '71' '68' '115' '85' '88' '90' '110' '130'
'129'
'138' '135' '155' '142' '125' '150' '71' '65' '80' '80' '77' '125'
'71'
'90' '70' '70' '65' '69' '90' '115' '115' '90' '76' '60' '70' '65'
'90'
'88' '90' '90' '78' '90' '75' '92' '75' '65' '105' '65' '48' '48'
'67'
'67' '67' '?' '67' '62' '132' '100' '88' '?' '72' '84' '84' '92'
'110'
'84' '58' '64' '60' '67' '65' '62' '68' '63' '65' '65' '74' '?' '75'
'75'
'100' '74' '80' '76' '116' '120' '110' '105' '88' '85' '88' '88' '88'
'85' '84' '90' '92' '?' '74' '68' '68' '63' '70' '88' '75' '70' '67'
'67'
'67' '110' '85' '92' '112' '96' '84' '90' '86' '52' '84' '79' '82']
# Per-column count of values pandas already treats as missing.
df.isna().sum()
Unnamed: 0 0
mpg 0
cylinders 0
displacement 0
horsepower 0
weight 0
acceleration 0
model year 0
origin 0
car name 0
dtype: int64
# Turn the "?" placeholders into real NaN so pandas recognises them as missing.
df.replace('?', np.nan, inplace=True)
# horsepower holds numeric strings (it was compared against '?' above); convert
# it to a numeric dtype so it can be used in arithmetic and statistics.
df['horsepower'] = pd.to_numeric(df['horsepower'])
# Re-count missing values per column.
df.isnull().sum()
Unnamed: 0 0
mpg 0
cylinders 0
displacement 0
horsepower 6
weight 0
acceleration 0
model year 0
origin 0
car name 0
dtype: int64
# Fences for mpg: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df["mpg"].quantile(0.25), df["mpg"].quantile(0.75)
iqr = q3 - q1
ll = q1 - 1.5 * iqr
ul = q3 + 1.5 * iqr
# Positions of the rows lying at or beyond each fence.
upper = np.where(df["mpg"] >= ul)
lower = np.where(df["mpg"] <= ll)
print("upper outliers", upper)
print("lower outliers", lower)
upper outliers (array([322]),)
lower outliers (array([], dtype=int64),)
# Drop the mpg outliers by index label.  np.where() reports positions, while
# DataFrame.drop() expects labels; taking the labels from a boolean mask on the
# current frame stays correct when the index is not 0..n-1 and makes the cell
# safe to re-run (already-removed rows simply no longer match).
df.drop(df[df['mpg'] >= ul].index, inplace=True)
print(df.shape)
df.drop(df[df['mpg'] <= ll].index, inplace=True)
print(df.shape)
(397, 10)
(397, 10)
# Re-draw the horizontal mpg box plot on the current contents of df.
sns.boxplot(df.mpg, orient="h")
<Axes: xlabel='mpg'>
# Keep only the rows without any NaN; df itself is left unchanged.
newdf = df.dropna(axis=0, how="any")
newdf.shape
(391, 10)
2. Imputing standard values (replacing outliers with the mean)
# Fresh copy of the raw CSV for this section.
df2 = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
df2.head(n=5)
df2.shape
(398, 10)
# Horizontal box plot of acceleration taken from df2, the frame this section
# works on (df has already been modified by the previous section).
sns.boxplot(df2['acceleration'], orient='h')
<Axes: xlabel='acceleration'>
# Scatter plot of mpg against acceleration.
df2.plot.scatter(x="acceleration", y="mpg")
<Axes: xlabel='acceleration', ylabel='mpg'>
# Fences for acceleration, computed on df2: 1.5 * IQR beyond the quartiles.
q1 = df2.acceleration.quantile(0.25)
q3 = df2.acceleration.quantile(0.75)
IQR = q3 - q1
LL = q1 - 1.5 * IQR
UL = q3 + 1.5 * IQR
# Positions of the rows lying strictly outside the fences.
upper = np.where(df2['acceleration'] > UL)
lower = np.where(df2['acceleration'] < LL)
med = df2['acceleration'].quantile(0.50)
# Report the acceleration statistics (IQR, LL, UL).  The lowercase iqr, ll and
# ul hold the mpg values from the earlier section and must not be printed here.
print('q1=', q1, 'median=', med, 'q3=', q3, 'iqr=', IQR)
print(LL, UL)
print('lower=', lower, 'upper=', upper)
q1= 14.0 median= 15.5 q3= 17.0 iqr= 11.5
0.25 46.25
lower= (array([6]),) upper= (array([196, 209, 325, 328]),)
# Boolean mask marking acceleration values strictly between LL and UL.
arr = df2.acceleration.values
true_index = np.logical_and(arr > LL, arr < UL)
true_index
array([ True, True, True, True, True, True, False, True, True,
True, True, True, False, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, False,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, False, True,
True, True, True, True, True, True, True, True, True,
True, True, False, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, False, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, False, True, True, False, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True, True, True, True, True, True, True, True,
True, True])
# Mean acceleration over the rows selected by true_index.
mid1 = np.mean(df2.loc[true_index, "acceleration"])
mid1
np.float64(15.468240722430952)
# Rows outside the fences are the complement of the in-range mask.
false_index = ~true_index  # ~ inverts a boolean array element-wise
# Assign through .loc: writing into Series.values relies on that array being a
# writable view of the frame, which fails under pandas Copy-on-Write.
df2.loc[false_index, 'acceleration'] = mid1
# Show the positions still above the upper fence (expected to be none).
print(np.where(df2['acceleration'] > UL))
(array([], dtype=int64),)
3. Capping outliers at a lower limit and
upper limit (using the 5th percentile and 95th percentile)
# Fresh copy of the raw CSV for the capping section.
df3 = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
df3.head(n=5)
{"summary":"{\n \"name\": \"df3\",\n \"rows\": 398,\n \"fields\":
[\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 115,\n \"min\": 0,\n
\"max\": 397,\n \"num_unique_values\": 398,\n
\"samples\": [\n 198,\n 396,\n 33\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"mpg\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.815984312565782,\n \"min\": 9.0,\n \"max\":
46.6,\n \"num_unique_values\": 129,\n \"samples\": [\n
17.7,\n 30.5,\n 30.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"cylinders\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
1,\n \"min\": 3,\n \"max\": 8,\n
\"num_unique_values\": 5,\n \"samples\": [\n 4,\n
5,\n 6\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"displacement\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 104.26983817119581,\n \"min\":
68.0,\n \"max\": 455.0,\n \"num_unique_values\": 82,\n
\"samples\": [\n 122.0,\n 307.0,\n 360.0\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"horsepower\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 94,\n \"samples\": [\n
\"112\",\n \"?\",\n \"78\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"weight\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 846,\n
\"min\": 1613,\n \"max\": 5140,\n \"num_unique_values\":
351,\n \"samples\": [\n 3730,\n 1995,\n
2215\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"acceleration\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 2.497555013249332,\n \"min\":
9.0,\n \"max\": 22.1,\n \"num_unique_values\": 89,\n
\"samples\": [\n 16.7,\n 15.8,\n 12.8\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"model year\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
3,\n \"min\": 70,\n \"max\": 82,\n
\"num_unique_values\": 13,\n \"samples\": [\n 81,\n
79,\n 70\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"origin\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 0,\n \"min\": 1,\n \"max\": 3,\n
\"num_unique_values\": 3,\n \"samples\": [\n 1,\n
3,\n 2\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\": \"car
name\",\n \"properties\": {\n \"dtype\": \"string\",\n
\"num_unique_values\": 305,\n \"samples\": [\n \"mazda
rx-4\",\n \"ford f108\",\n \"buick century luxus
(sw)\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df3"}
# The 95th and 5th percentiles of mpg serve as the capping limits.
max_threshold = df3.mpg.quantile(0.95)
min_threshold = df3.mpg.quantile(0.05)
print(max_threshold, min_threshold)
# Show the row labelled 322 as it looks before capping.
print(df3.loc[[322], :])
37.029999999999994 13.0
Unnamed: 0 mpg cylinders displacement horsepower weight \
322 322 46.6 4 86.0 65 2110
acceleration model year origin car name
322 17.9 80 3 mazda glc
# Cap mpg at the percentile limits: values above max_threshold become
# max_threshold, values below min_threshold become min_threshold, and
# everything in between is kept as is.
df3['mpg'] = df3['mpg'].clip(lower=min_threshold, upper=max_threshold)
sns.boxplot(df3.mpg, orient="h")
<Axes: xlabel='mpg'>