Learning Concepts Hackers Realm
1 Data Normalization
[1]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
warnings.filterwarnings('ignore')
%matplotlib inline
[2]: df = pd.read_csv('data/winequality.csv')
df.head()
[2]: type fixed acidity volatile acidity citric acid residual sugar \
0 white 7.0 0.27 0.36 20.7
1 white 6.3 0.30 0.34 1.6
2 white 8.1 0.28 0.40 6.9
3 white 7.2 0.23 0.32 8.5
4 white 7.2 0.23 0.32 8.5
[3]: df.describe()
[3]: fixed acidity volatile acidity citric acid residual sugar \
count 6487.000000 6489.000000 6494.000000 6495.000000
mean 7.216579 0.339691 0.318722 5.444326
std 1.296750 0.164649 0.145265 4.758125
min 3.800000 0.080000 0.000000 0.600000
25% 6.400000 0.230000 0.250000 1.800000
50% 7.000000 0.290000 0.310000 3.000000
75% 7.700000 0.400000 0.390000 8.100000
max 15.900000 1.580000 1.660000 65.800000
[20]: sns.distplot(df['alcohol'])
1.1 Max absolute scaling
[10]: ## value / max_value
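The scaling cell itself did not survive the export; a minimal sketch of max absolute scaling, assuming it was applied to the alcohol column plotted in the surrounding cells:
df_temp = df.copy()
# divide by the column's maximum absolute value -> values land in [-1, 1]
df_temp['alcohol'] = df_temp['alcohol'] / df_temp['alcohol'].abs().max()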
[22]: sns.distplot(df_temp['alcohol'])
[22]: <AxesSubplot:xlabel='alcohol', ylabel='Density'>
[25]: sns.distplot(df_temp['alcohol'])
1.2 Min-max scaling
[ ]: # original_value = scaled_value * (max-min) + min
[28]: df_temp = df.copy()
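The transformation line is missing; a minimal min-max sketch (the forward version of the inverse formula in the comment above), scaling to [0, 1]:
df_temp['alcohol'] = (df_temp['alcohol'] - df_temp['alcohol'].min()) / (df_temp['alcohol'].max() - df_temp['alcohol'].min())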
[39]: sns.distplot(df['pH'])
[41]: scaled_data = df.copy()
[44]: sns.distplot(scaled_data['pH'])
1.3 Standardization
[45]: from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
[47]: sc.fit(df[['pH']])
[47]: StandardScaler()
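The transform cell is missing; a sketch producing the sc_data plotted two cells below:
sc_data = sc.transform(df[['pH']])   # standardized pH: mean 0, standard deviation 1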
[56]: sns.distplot(df['pH'])
[55]: sns.distplot(sc_data)
[55]: <AxesSubplot:ylabel='Density'>
2 Outlier Detection and Removal
[6]: # to see outliers clearly
sns.boxplot(df['residual sugar'])
2.1 Z-score method
[41]: # find the limits
upper_limit = df['residual sugar'].mean() + 3*df['residual sugar'].std()
lower_limit = df['residual sugar'].mean() - 3*df['residual sugar'].std()
print('upper limit:', upper_limit)
print('lower limit:', lower_limit)
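The code of the cell that lists the outliers was lost; a sketch consistent with the output below, using the same boolean mask as the percentile section:
# find the outliers
df.loc[(df['residual sugar'] > upper_limit) | (df['residual sugar'] < lower_limit)]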
[42]: type fixed acidity volatile acidity citric acid residual sugar \
0 white 7.0 0.270 0.36 20.70
7 white 7.0 0.270 0.36 20.70
182 white 6.8 0.280 0.40 22.00
191 white 6.8 0.280 0.40 22.00
292 white 7.4 0.280 0.42 19.80
444 white 6.9 0.240 0.36 20.80
1454 white 8.3 0.210 0.49 19.80
1608 white 6.9 0.270 0.49 23.50
1653 white 7.9 0.330 0.28 31.60
1663 white 7.9 0.330 0.28 31.60
2489 white 6.1 0.280 0.24 19.95
2492 white 6.1 0.280 0.24 19.95
2620 white 6.5 0.280 0.28 20.40
2781 white 7.8 0.965 0.60 65.80
2785 white 6.4 0.240 0.25 20.20
2787 white 6.4 0.240 0.25 20.20
3014 white 7.0 0.450 0.34 19.80
3023 white 7.0 0.450 0.34 19.80
3420 white 7.6 0.280 0.49 20.15
3497 white 7.7 0.430 1.00 19.95
3547 white 7.3 0.200 0.29 19.90
3619 white 6.8 0.450 0.28 26.05
3623 white 6.8 0.450 0.28 26.05
3730 white 6.2 0.220 0.20 20.80
4107 white 6.8 0.300 0.26 20.30
4480 white 5.9 0.220 0.45 22.60
chlorides free sulfur dioxide total sulfur dioxide density pH \
7 0.045 45.0 170.0 1.00100 3.00
182 0.048 48.0 167.0 1.00100 2.93
191 0.048 48.0 167.0 1.00100 2.93
292 0.066 53.0 195.0 1.00000 2.96
444 0.031 40.0 139.0 0.99750 3.20
1454 0.054 50.0 231.0 1.00120 2.99
1608 0.057 59.0 235.0 1.00240 2.98
1653 0.053 35.0 176.0 1.01030 3.15
1663 0.053 35.0 176.0 1.01030 3.15
2489 0.074 32.0 174.0 0.99922 3.19
2492 0.074 32.0 174.0 0.99922 3.19
2620 0.041 40.0 144.0 1.00020 3.14
2781 0.074 8.0 160.0 1.03898 3.39
2785 0.083 35.0 157.0 0.99976 3.17
2787 0.083 35.0 157.0 0.99976 3.17
3014 0.040 12.0 67.0 0.99760 3.07
3023 0.040 12.0 67.0 0.99760 3.07
3420 0.060 30.0 145.0 1.00196 3.01
3497 0.032 42.0 164.0 0.99742 3.29
3547 0.039 69.0 237.0 1.00037 3.10
3619 0.031 27.0 122.0 1.00295 3.06
3623 0.031 27.0 122.0 1.00295 3.06
3730 0.035 58.0 184.0 1.00022 3.11
4107 0.037 45.0 150.0 0.99727 3.04
4480 0.120 55.0 122.0 0.99636 3.10
sulphates alcohol quality
… … … …
3547 0.48 9.2 6
3619 0.42 10.6 6
3623 0.42 10.6 6
3730 0.53 9.0 6
4107 0.38 12.3 6
4480 0.35 12.8 5
[45]: # capping - change the outlier values to upper (or) lower limit values
new_df = df.copy()
new_df.loc[(new_df['residual sugar']>=upper_limit), 'residual sugar'] = upper_limit
[47]: len(new_df)
[47]: 6497
2.2 IQR method
[50]: q1 = df['residual sugar'].quantile(0.25)
q3 = df['residual sugar'].quantile(0.75)
iqr = q3 - q1
upper_limit = q3 + (1.5 * iqr)
lower_limit = q1 - (1.5 * iqr)
lower_limit, upper_limit
[52]: type fixed acidity volatile acidity citric acid residual sugar \
0 white 7.0 0.270 0.36 20.70
7 white 7.0 0.270 0.36 20.70
14 white 8.3 0.420 0.62 19.25
38 white 7.3 0.240 0.39 17.95
39 white 7.3 0.240 0.39 17.95
… … … … … …
4691 white 6.9 0.190 0.31 19.25
4694 white 6.9 0.190 0.31 19.25
4748 white 6.1 0.340 0.24 18.35
4749 white 6.2 0.350 0.25 18.40
4778 white 5.8 0.315 0.19 19.40
[55]: # capping - change the outlier values to upper (or) lower limit values
new_df = df.copy()
new_df.loc[(new_df['residual sugar']>upper_limit), 'residual sugar'] = upper_limit
2.3 Percentile method
[57]: upper_limit = df['residual sugar'].quantile(0.99)
lower_limit = df['residual sugar'].quantile(0.01)
print('upper limit:', upper_limit)
print('lower limit:', lower_limit)
[59]: # find the outliers
df.loc[(df['residual sugar'] > upper_limit) | (df['residual sugar'] < lower_limit)]
[59]: type fixed acidity volatile acidity citric acid residual sugar \
0 white 7.0 0.270 0.36 20.70
7 white 7.0 0.270 0.36 20.70
14 white 8.3 0.420 0.62 19.25
103 white 7.5 0.305 0.40 18.90
111 white 7.2 0.270 0.46 18.75
… … … … … …
4749 white 6.2 0.350 0.25 18.40
4778 white 5.8 0.315 0.19 19.40
4779 white 6.0 0.590 0.00 0.80
4877 white 5.9 0.540 0.00 0.80
4897 white 6.0 0.210 0.38 0.80
chlorides free sulfur dioxide total sulfur dioxide density pH \
… … … … … …
4778 0.031 28.0 106.0 0.99704 2.97
4779 0.037 30.0 95.0 0.99032 3.10
4877 0.032 12.0 82.0 0.99286 3.25
4897 0.020 22.0 98.0 0.98941 3.26
[62]: # capping - change the outlier values to upper (or) lower limit values
new_df = df.copy()
new_df.loc[(new_df['residual sugar']>upper_limit), 'residual sugar'] = upper_limit
new_df.loc[(new_df['residual sugar']<lower_limit), 'residual sugar'] = lower_limit
[65]: sns.distplot(df['residual sugar'])
[64]: sns.distplot(new_df['residual sugar'])
3 Feature Encoding
[7]: season
0 summer
1 autumn
2 spring
3 winter
4 autumn
3.1 Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['season_label'] = le.fit_transform(df['season'])
df.head()
3.2 One-Hot Encoding
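The import cell is missing; OneHotEncoder lives in sklearn.preprocessing:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()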
[14]: ohe.fit_transform(df[['season']]).toarray()
[15]: ohe_values = ohe.fit_transform(df[['season']]).toarray()
ohe_df = pd.DataFrame(ohe_values)
enc_df = pd.concat([df, ohe_df], axis=1)
enc_df.head()
[ ]:
# (sketch of a lost cell) load the loan dataset used for the encodings below
df = pd.read_csv('data/Loan Prediction Dataset.csv')
df.head()
Credit_History Property_Area Loan_Status
1 1.0 Rural N
2 1.0 Urban Y
3 1.0 Urban Y
4 1.0 Urban Y
df.head()
Loan_ID Gender Married Dependents Education Self_Employed \
2 LP001005 Male Yes 0 Graduate Yes
3 LP001006 Male Yes 0 Not Graduate No
4 LP001008 Male No 0 Graduate No
[25]: df.sample(frac=1).head(10)
71 1.0 Semiurban 1 0.693252 0.752475
474 1.0 Rural 1 0.693252 0.752475
266 1.0 Semiurban 1 0.693252 0.752475
541 0.0 Semiurban 0 0.669643 0.689855
354 1.0 Semiurban 1 0.669643 0.689855
116 1.0 Semiurban 1 0.669643 0.689855
16 NaN Urban 1 0.693252 0.647059
598 1.0 Rural 1 0.693252 0.689855
[31]: df.groupby('Education').size()
[31]: Education
Graduate 480
Not Graduate 134
dtype: int64
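The frequency-encoding cell is missing; a sketch that reproduces the Education_freq values shown below (480/614 ≈ 0.781759):
freq = df.groupby('Education').size() / len(df)   # relative frequency of each category
df['Education_freq'] = df['Education'].map(freq)  # map each row's category to its frequency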
[32]: Education Self_Employed Dependents Education_freq
0 Graduate No 0 0.781759
1 Graduate No 1 0.781759
2 Graduate Yes 0 0.781759
3 Not Graduate No 0 0.218241
4 Graduate No 0 0.781759
Self_Employed_1
291 1
594 0
179 1
401 1
443 1
26 1
219 1
483 1
340 1
273 1
[ ]:
[5]: Datetime
0 2012-08-25 00:00:00
1 2012-08-25 01:00:00
2 2012-08-25 02:00:00
3 2012-08-25 03:00:00
4 2012-08-25 04:00:00
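The extraction cells are missing; a minimal sketch of the datetime features that appear in the outputs below (the exact column set is assumed from those outputs):
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['year'] = df['Datetime'].dt.year
df['month'] = df['Datetime'].dt.month
df['day'] = df['Datetime'].dt.day
df['quarter'] = df['Datetime'].dt.quarter
df['week'] = df['Datetime'].dt.isocalendar().week
df['is_weekend'] = (df['Datetime'].dt.dayofweek >= 5).astype(int)
df['hour'] = df['Datetime'].dt.hour
df['minute'] = df['Datetime'].dt.minute
df['second'] = df['Datetime'].dt.second
df['date'] = df['Datetime'].dt.date
df['time'] = df['Datetime'].dt.time
df['difference'] = pd.Timestamp.now() - df['Datetime']   # elapsed time since each timestamp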
63 2012-08-27 15:00:00 2012 8 27 3 0 35
59 2012-08-27 11:00:00 2012 8 27 3 0 35
54 2012-08-27 06:00:00 2012 8 27 3 0 35
is_weekend
4 1
37 1
49 0
28 1
88 0
32 1
58 0
63 0
59 0
54 0
is_weekend hour minute second date time
0 1 0 0 0 2012-08-25 00:00:00
1 1 1 0 0 2012-08-25 01:00:00
2 1 2 0 0 2012-08-25 02:00:00
3 1 3 0 0 2012-08-25 03:00:00
4 1 4 0 0 2012-08-25 04:00:00
difference
0 3522 days 13:24:49.747950
1 3522 days 12:24:49.747950
2 3522 days 11:24:49.747950
3 3522 days 10:24:49.747950
4 3522 days 09:24:49.747950
[ ]:
4 Handle Missing Values
[25]: df.isnull().sum()
[25]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
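The dropna cell is missing; a sketch whose output is the all-zero listing below (the length checks further down show the 614 rows shrink to 480 once every incomplete row is dropped):
new_df = df.dropna()   # drop every row that has at least one missing value
new_df.isnull().sum()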
[38]: Loan_ID 0
Gender 0
Married 0
Dependents 0
Education 0
Self_Employed 0
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 0
Property_Area 0
Loan_Status 0
dtype: int64
[40]: df['Gender'].value_counts()
[42]: new_df['Gender'].value_counts()
[44]: len(df)
[44]: 614
[45]: len(new_df)
[45]: 480
[46]: new_df.isnull().sum()
[46]: Loan_ID 0
Gender 0
Married 0
Dependents 0
Education 0
Self_Employed 0
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 0
Property_Area 0
Loan_Status 0
dtype: int64
[48]: df['LoanAmount'].mean()
[48]: 146.41216216216216
[49]: sns.distplot(df['LoanAmount'])
[50]: # fill missing value for numerical
new_df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
new_df.isnull().sum()
[50]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
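The cell that fills Loan_Amount_Term is missing between these two checks; a sketch, assuming the same mean-based fill as above:
new_df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())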
new_df.isnull().sum()
[52]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
[53]: sns.countplot(df['Self_Employed'])
[55]: df['Self_Employed'].mode()[0]
[55]: 'No'
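The categorical fill cell is missing; a sketch using the mode just computed:
# fill missing value for categorical
new_df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])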
new_df.isnull().sum()
[56]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 0
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
4.0.5 Fill missing value based on grouping category
[62]: df.groupby('Loan_Status').mean()['LoanAmount']
[62]: Loan_Status
N 151.220994
Y 144.294404
Name: LoanAmount, dtype: float64
[63]: mean_df = df.groupby('Loan_Status').mean()['LoanAmount']
mean_df['N']
[63]: 151.22099447513813
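The loop that applies these group means is missing; a sketch following the same pattern as the mode-based fill later in this section:
for val in ['N', 'Y']:
    mask = new_df['Loan_Status'] == val
    new_df.loc[mask, 'LoanAmount'] = new_df.loc[mask, 'LoanAmount'].fillna(mean_df[val])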
[70]: new_df.isnull().sum()
[70]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
[73]: mean_df = df.groupby('Loan_Status').mean()['Loan_Amount_Term']
mean_df
[73]: Loan_Status
N 344.064516
Y 341.072464
Name: Loan_Amount_Term, dtype: float64
[80]: df.groupby('Loan_Status')['Loan_Amount_Term'].agg(pd.Series.mean)
[80]: Loan_Status
N 344.064516
Y 341.072464
Name: Loan_Amount_Term, dtype: float64
[77]: new_df.isnull().sum()
[77]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
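The cell that computes the per-group mode is missing; a sketch that matches the output below, mirroring the pd.Series.mean pattern above:
mode_df = df.groupby('Loan_Status')['Self_Employed'].agg(pd.Series.mode)
mode_df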
[79]: Loan_Status
N No
Y No
Name: Self_Employed, dtype: object
for val in ['N', 'Y']:
    new_df.loc[(new_df['Loan_Status']==val), 'Self_Employed'] = new_df.loc[(new_df['Loan_Status']==val), 'Self_Employed'].fillna(mode_df[val])
[82]: new_df.isnull().sum()
[82]: Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 0
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 0
Loan_Amount_Term 0
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
new_df.head()
[93]: len(new_df)
[93]: 614
[96]: # input and output split (col and the numeric new_df come from lost cells; 'LoanAmount' assumed from the outputs below)
col = 'LoanAmount'
new_df = df[['LoanAmount', 'Loan_Amount_Term', 'ApplicantIncome', 'CoapplicantIncome']].copy()
new_df_temp = new_df.dropna(subset=[col])   # train only on rows where the target column is present
X = new_df_temp.drop(columns=[col], axis=1)
y = new_df_temp[col]
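The training cell is missing; a sketch consistent with the estimator repr below:
from lightgbm import LGBMRegressor
model = LGBMRegressor(use_missing=False)
model.fit(X, y)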
[97]: LGBMRegressor(use_missing=False)
[98]: d = {}
temp = new_df.drop(columns=[col], axis=1)
d[col] = list(model.predict(temp))
[99]: i = 0
for val, d_val in zip(new_df[col], d[col]):
    if pd.isna(val):
        new_df.at[i, col] = d_val
    i += 1
[100]: new_df.isnull().sum()
[100]: LoanAmount 0
Loan_Amount_Term 14
ApplicantIncome 0
CoapplicantIncome 0
dtype: int64
[101]: new_df.head()
5 Feature Selection Techniques
5.1 Correlation Matrix (Numerical Attributes)
[2]: df = pd.read_csv('data/bike sharing dataset.csv')
df.head()
df.corr()
weekday workingday weathersit temp atemp hum \
hr -0.003498 0.002285 -0.020203 0.137603 0.133750 -0.276498
holiday -0.102088 -0.252471 -0.017036 -0.027340 -0.030973 -0.010588
weekday 1.000000 0.035955 0.003311 -0.001795 -0.008821 -0.037158
workingday 0.035955 1.000000 0.044672 0.055390 0.054667 0.015688
weathersit 0.003311 0.044672 1.000000 -0.102640 -0.105563 0.418130
temp -0.001795 0.055390 -0.102640 1.000000 0.987672 -0.069881
atemp -0.008821 0.054667 -0.105563 0.987672 1.000000 -0.051918
hum -0.037158 0.015688 0.418130 -0.069881 -0.051918 1.000000
windspeed 0.011502 -0.011830 0.026226 -0.023125 -0.062336 -0.290105
casual 0.032721 -0.300942 -0.152628 0.459616 0.454080 -0.347028
registered 0.021578 0.134326 -0.120966 0.335361 0.332559 -0.273933
cnt 0.026900 0.030284 -0.142426 0.404772 0.400929 -0.322911
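The heatmap cell lost its code; a minimal sketch matching the axes object below (figure size assumed):
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')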
[8]: <AxesSubplot:>
5.2 Chi-Square (Categorical Attributes)
[10]: df = pd.read_csv('data/Loan Prediction Dataset.csv')
df = df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Loan_Status']]
Property_Area Loan_Status
0 Urban Y
1 Rural N
2 Urban Y
3 Urban Y
4 Urban Y
Property_Area Loan_Status
0 2 1
1 0 0
2 2 1
3 2 1
4 2 1
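The encoding and chi-square cells are missing (an earlier lost cell label-encoded these columns to the integers shown above); a sketch consistent with the plots below:
from sklearn.feature_selection import chi2
X = df.drop(columns=['Loan_Status'], axis=1)
y = df['Loan_Status']
chi_scores = chi2(X, y)   # a pair of arrays: (chi-square statistics, p-values)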
[15]: chi_scores
[16]: <AxesSubplot:>
[17]: # if the p-value is > 0.05, the feature is likely independent of the target, so its importance is low
p_values = pd.Series(chi_scores[1], index=X.columns)
p_values.sort_values(ascending=False, inplace=True)
p_values.plot.bar()
[17]: <AxesSubplot:>
5.3 Recursive Feature Elimination (RFE)
[18]: df.head()
Property_Area Loan_Status
0 2 1
1 0 0
2 2 1
3 2 1
4 2 1
[20]: # input split
X = df.drop(columns=['Loan_Status'], axis=1)
y = df['Loan_Status']
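The RFE cell itself is missing; a minimal sketch of the technique named in the heading (estimator and feature count are assumptions):
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5)
rfe.fit(X, y)
pd.Series(rfe.ranking_, index=X.columns).sort_values()   # rank 1 = selected feature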
[ ]:
6 Cross Validation
[6]: from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
7 Handle Imbalanced Dataset
[9]: df = pd.read_csv('data/creditcard.csv')
df.head()
[9]: Time V1 V2 V3 V4 V5 V6 V7 \
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941
V26 V27 V28 Amount Class
0 -0.189115 0.133558 -0.021053 149.62 0
1 0.125895 -0.008983 0.014724 2.69 0
2 -0.139097 -0.055353 -0.059752 378.66 0
3 -0.221929 0.062723 0.061458 123.50 0
4 0.502292 0.219422 0.215153 69.99 0
[5 rows x 31 columns]
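The input/output split cell is missing; a sketch (the creditcard target column is Class):
X = df.drop(columns=['Class'], axis=1)
y = df['Class']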
[12]: sns.countplot(y)
[14]: from collections import Counter
Counter(y)
7.1 Over Sampling Techniques
7.1.1 RandomOverSampler
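The resampling cell is missing; a minimal sketch with imblearn's standard API (variable names taken from the plot call below):
from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()
X_over, y_over = over.fit_resample(X, y)   # duplicate minority-class rows until the classes balance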
[22]: sns.countplot(y_over)
7.2 Under Sampling Technique
7.2.1 RandomUnderSampler
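The matching under-sampling cell is also missing; a sketch:
from imblearn.under_sampling import RandomUnderSampler
under = RandomUnderSampler()
X_under, y_under = under.fit_resample(X, y)   # drop majority-class rows until the classes balance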
[31]: sns.countplot(y_under)
7.3 Combine Oversampling and Undersampling
[37]: from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
over = SMOTE(sampling_strategy=0.1)   # oversample the minority class to 10% of the majority
under = RandomUnderSampler(sampling_strategy=0.5)
pipeline = Pipeline([('o', over), ('u', under)])
X_resample, y_resample = pipeline.fit_resample(X, y)
[38]: sns.countplot(y)
[39]: sns.countplot(y_resample)
[ ]:
8 Ensembling Techniques
[46]: df = pd.read_csv('data/winequality.csv')
df = df.drop(columns=['type'], axis=1)
df = df.fillna(-2)
df.head()
[46]: fixed acidity volatile acidity citric acid residual sugar chlorides \
0 7.0 0.27 0.36 20.7 0.045
1 6.3 0.30 0.34 1.6 0.049
2 8.1 0.28 0.40 6.9 0.050
3 7.2 0.23 0.32 8.5 0.058
4 7.2 0.23 0.32 8.5 0.058
alcohol quality
0 8.8 6
1 9.5 6
2 10.1 6
3 9.9 6
4 9.9 6
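The split and baseline cells did not survive; a sketch consistent with the variable names used below (the baseline model behind the 0.466 score is unknown, LogisticRegression is an assumption):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X = df.drop(columns=['quality'], axis=1)
y = df['quality']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = LogisticRegression()
model.fit(x_train, y_train)
model.score(x_test, y_test)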
[49]: 0.4664615384615385
8.1 Voting Classifier
[52]: from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = RandomForestClassifier()
# hard voting - each model votes for a class, majority wins
model = VotingClassifier(estimators=[('lr', model1), ('knn', model2), ('rf', model3)], voting='hard')
model.fit(x_train, y_train)
model.score(x_test, y_test)
[52]: 0.6344615384615384
8.2 Averaging
[53]: model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = RandomForestClassifier()
model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)
pred1 = model1.predict_proba(x_test)
pred2 = model2.predict_proba(x_test)
pred3 = model3.predict_proba(x_test)
final_pred = (pred1+pred2+pred3)/3
[56]: sns.countplot(y)
[58]: final_pred
[79]: pred = []
for res in final_pred:
    pred.append(np.argmax(res) + 3)   # quality labels start at 3, argmax indices at 0
[80]: 0.6350769230769231
8.3 Weighted Average
[85]: model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = RandomForestClassifier()
model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)
pred1 = model1.predict_proba(x_test)
pred2 = model2.predict_proba(x_test)
pred3 = model3.predict_proba(x_test)
final_pred = pred1*0.25 + pred2*0.25 + pred3*0.5   # weighted average; the weights already sum to 1
[86]: pred = []
for res in final_pred:
    pred.append(np.argmax(res) + 3)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)
[86]: 0.6652307692307692
[ ]:
9 Dimensionality Reduction Techniques
[5]: from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from umap import UMAP
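The data-loading cell is missing; the (60000, 2) shape below matches the MNIST training split, so a plausible sketch (an assumption, not confirmed by the export):
from tensorflow.keras.datasets import mnist
(X, y), _ = mnist.load_data()
X = X.reshape(len(X), -1)   # flatten each 28x28 image into a 784-dim row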
9.1 PCA
[13]: x_pca = PCA(n_components=2).fit_transform(X)
[14]: x_pca.shape
[14]: (60000, 2)
[21]: plt.figure(figsize=(10,10))
sc = plt.scatter(x_pca[:, 0], x_pca[:, 1], c=y)
plt.legend(handles=sc.legend_elements()[0], labels=list(range(10)))
plt.show()
9.2 LDA
[22]: x_lda = LDA(n_components=2).fit_transform(X, y)
[23]: plt.figure(figsize=(10,10))
sc = plt.scatter(x_lda[:, 0], x_lda[:, 1], c=y)
plt.legend(handles=sc.legend_elements()[0], labels=list(range(10)))
plt.show()
9.3 t-SNE
[12]: # taking only 10k samples for quick results
x_tsne = TSNE(n_jobs=-1).fit_transform(X[:10000])
[13]: plt.figure(figsize=(10,10))
sc = plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y[:10000])
plt.legend(handles=sc.legend_elements()[0], labels=list(range(10)))
plt.show()
9.4 UMAP
[6]: x_umap = UMAP(n_neighbors=10, min_dist=0.1, metric='correlation').fit_transform(X)
[7]: plt.figure(figsize=(10,10))
sc = plt.scatter(x_umap[:, 0], x_umap[:, 1], c=y)
plt.legend(handles=sc.legend_elements()[0], labels=list(range(10)))
plt.show()
[ ]:
10 Handle Large Data (CSV)
[2]: df = pd.read_csv('data/1000000 Sales Records.csv')
df.head()
Order Priority Order Date Order ID Ship Date Units Sold Unit Price \
0 M 7/27/2012 443368995 7/28/2012 1593 9.33
1 M 9/14/2013 667593514 10/19/2013 4611 109.28
2 M 5/15/2015 940995585 6/4/2015 360 421.89
3 H 5/17/2017 880811536 7/2/2017 562 109.28
4 L 10/26/2016 174590194 12/4/2016 3973 47.45
df.info(verbose=False, memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 14 entries, Region to Total Profit
dtypes: float64(5), int64(2), object(7)
memory usage: 489.9 MB
10.1 nrows
[4]: df = pd.read_csv('data/1000000 Sales Records.csv', nrows=1000)
df.info(verbose=False, memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 14 entries, Region to Total Profit
dtypes: float64(5), int64(2), object(7)
memory usage: 502.1 KB
10.2 usecols and dtypes
[6]: array(['Region', 'Country', 'Item Type', 'Sales Channel',
'Order Priority', 'Order Date', 'Order ID', 'Ship Date',
'Units Sold', 'Unit Price', 'Unit Cost', 'Total Revenue',
'Total Cost', 'Total Profit'], dtype=object)
Order Priority Units Sold Unit Price Unit Cost Total Revenue \
0 M 1593 9.33 6.92 14862.69
1 M 4611 109.28 35.84 503890.08
2 M 360 421.89 364.69 151880.40
3 H 562 109.28 35.84 61415.36
4 L 3973 47.45 31.79 188518.85
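The usecols cell is missing; per the info output below the reload keeps 11 of the 14 columns, and the dtype counts imply Order ID, Order Date and Ship Date were the ones dropped:
cols = ['Region', 'Country', 'Item Type', 'Sales Channel', 'Order Priority',
        'Units Sold', 'Unit Price', 'Unit Cost', 'Total Revenue', 'Total Cost', 'Total Profit']
df = pd.read_csv('data/1000000 Sales Records.csv', usecols=cols)
df.info(verbose=False, memory_usage='deep')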
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 11 entries, Region to Total Profit
dtypes: float64(5), int64(1), object(5)
memory usage: 356.5 MB
[10]: df.describe()
Units Sold Unit Price Unit Cost Total Revenue \
count 1000000.000000 1000000.000000 1000000.000000 1.000000e+06
mean 4998.867302 266.025488 187.522978 1.329563e+06
std 2885.334142 216.987966 175.650798 1.468527e+06
min 1.000000 9.330000 6.920000 9.330000e+00
25% 2502.000000 81.730000 35.840000 2.778672e+05
50% 4998.000000 154.060000 97.440000 7.844445e+05
75% 7496.000000 421.890000 263.330000 1.822444e+06
max 10000.000000 668.270000 524.960000 6.682700e+06
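The dtype-conversion cell is missing; a sketch matching the dtype counts reported below (note float16 tops out near 6.5e4, so the largest revenue/cost totals overflow; the notebook's exact choices aren't recoverable):
dtypes = {'Region': 'category', 'Country': 'category', 'Item Type': 'category',
          'Sales Channel': 'category', 'Order Priority': 'category',
          'Units Sold': 'int16', 'Unit Price': 'float16', 'Unit Cost': 'float16',
          'Total Revenue': 'float16', 'Total Cost': 'float16', 'Total Profit': 'float16'}
df = pd.read_csv('data/1000000 Sales Records.csv', usecols=list(dtypes), dtype=dtypes)
df.info(verbose=False, memory_usage='deep')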
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 11 entries, Region to Total Profit
dtypes: category(5), float16(5), int16(1)
memory usage: 17.2 MB
df.info(verbose=False, memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 11 entries, Region to Total Profit
dtypes: category(1), float64(5), int16(1), object(4)
memory usage: 282.3 MB
10.3 Load Dataset Faster using chunks
[17]: %%time
df = pd.read_csv('data/1000000 Sales Records.csv')
len(df)
[17]: 1000000
[19]: %%time
chunks = pd.read_csv('data/1000000 Sales Records.csv', iterator=True, chunksize=1000)
# df = pd.concat(chunks, ignore_index=True)
# df.head()
[20]: length = 0
for chunk in chunks:
    length += len(chunk)
length
[20]: 1000000
[ ]:
11 Sampling Techniques
[1]: import pandas as pd
df = pd.read_csv('data/winequality.csv')
df.head()
[5 rows x 13 columns]
[2]: len(df)
[2]: 6497
11.0.1 Random Sample
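These cells lost their code; minimal sketches consistent with the lengths reported below:
sample_df = df.sample(n=500)   # simple random sample without replacement
sample_df.head()
# a later cell draws more rows than the data has, which requires replacement:
sample_df = df.sample(n=10000, replace=True)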
[5 rows x 13 columns]
[4]: len(sample_df)
[4]: 500
[5 rows x 13 columns]
[10]: len(sample_df)
[10]: 10000
1 white 6.3 0.30 … 0.49 9.5 6
2 white 8.1 0.28 … 0.44 10.1 6
3 white 7.2 0.23 … 0.40 9.9 6
4 white 7.2 0.23 … 0.40 9.9 6
[5 rows x 13 columns]
X = df.drop(columns=['quality'])
y = df['quality']
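The split cells are missing; the two countplots below compare a plain split with a stratified one, so a sketch of both (split parameters assumed):
from sklearn.model_selection import train_test_split
# plain random split - the class ratios in y_test can drift from the full data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# stratified split - y_test keeps the class proportions of y
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)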
[16]: sns.countplot(x=y_test)
[16]: <Axes: xlabel='quality', ylabel='count'>
[18]: sns.countplot(x=y_test)
[ ]:
[5 rows x 13 columns]
12 Regularization Techniques
[20]: df = df.drop(columns=['type'])
df = df.fillna(-2)
df.head(2)
[2 rows x 12 columns]
[21]: X = df.drop(columns=['quality'])
y = df['quality']
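The model cells are missing; a sketch consistent with the Ridge MSE line and the coef_ cells below (alpha values and split parameters are assumptions):
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(x_train, y_train)
print('Ridge MSE:', mean_squared_error(y_test, ridge_model.predict(x_test)))
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(x_train, y_train)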
Ridge MSE: 0.48188801180027196
[42]: lasso_model.coef_
[45]: ridge_model.coef_
plt.ylabel('Coefficients')
plt.title('Coefficients of Features')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[ ]:
13 Pipeline Module
[1]: from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
[6]: # build the pipeline
pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler()),
('model', LogisticRegression())
])
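The fit and evaluation cells are missing; a sketch matching the accuracy and predictions printed below (split parameters assumed):
X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
pipeline.fit(x_train, y_train)
print('Accuracy:', pipeline.score(x_test, y_test))
print('Predictions:', pipeline.predict(x_test))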
Accuracy: 1.0
Predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1 0]