Entrée [49]: import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
Entrée [36]: # Load the data
# NOTE(review): absolute, machine-specific Windows path - this cell only runs
# where the CSV exists at exactly this location.
df = pd.read_csv("C:/Users/PC/Desktop/2018_2020_waste.csv")
Entrée [37]: df.head()
Out[37]:
Waste Type Total Generated ('000 tonnes) Total Recycled ('000 tonnes) Year
0 Construction& Demolition 1624 1618 2018
1 Ferrous Metal 1269 126 2018
2 Paper/Cardboard 1054 586 2018
3 Plastics 949 41 2018
4 Food 763 126 2018
Entrée [38]: df.shape
Out[38]: (45, 4)
Entrée [39]: df.describe()
Out[39]:
Total Generated ('000 tonnes) Total Recycled ('000 tonnes) Year
count 45.000000 45.000000 45.000000
mean 1073.644444 508.688889 2019.000000
std 1951.504154 1035.912023 0.825723
min 23.000000 6.000000 2018.000000
25% 168.000000 25.000000 2018.000000
50% 313.000000 126.000000 2019.000000
75% 949.000000 428.000000 2020.000000
max 7695.000000 4726.000000 2020.000000
Entrée [40]: df.tail()
Out[40]:
Waste Type Total Generated ('000 tonnes) Total Recycled ('000 tonnes) Year
40 Non-ferrous metal 75 73 2020
41 Glass 66 7 2020
42 Scrap tyres 23 22 2020
43 Others (stones, ceramics, etc.) 193 21 2020
44 Overall 5880 3040 2020
Entrée [41]: sns.pairplot(df);  # grid of pairwise plots over df's numeric columns
Entrée [42]: label_encode = LabelEncoder()
# Encode each waste-type name as an integer class id and store it in a new
# column 'sortie' (French for "output").
labels = label_encode.fit_transform(df['Waste Type'])
df['sortie'] = labels
# Drop the text column in place now that its encoded copy exists.
# NOTE(review): `columns=` already names the axis, so `axis=1` looks redundant
# here - confirm against the pandas DataFrame.drop documentation.
df.drop(columns=['Waste Type'], axis=1, inplace=True)
# Preview the frame after the replacement.
df.head()
Out[42]:
Total Generated ('000 tonnes) Total Recycled ('000 tonnes) Year sortie
0 1624 1618 2018 3
1 1269 126 2018 4
2 1054 586 2018 14
3 949 41 2018 15
4 763 126 2018 6
Entrée [43]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Total Generated ('000 tonnes) 45 non-null int64
1 Total Recycled ('000 tonnes) 45 non-null int64
2 Year 45 non-null int64
3 sortie 45 non-null int32
dtypes: int32(1), int64(3)
memory usage: 1.4 KB
Entrée [44]: label_encode = LabelEncoder()
# NOTE(review): rebinding `label_encode` replaces the encoder fitted on
# 'Waste Type' in the earlier cell, so the 'sortie' codes can no longer be
# mapped back to names through this variable.
# NOTE(review): the recycled tonnage is a numeric column; LabelEncoder swaps
# each distinct value for an integer class code, so 'sortie1' no longer carries
# the actual quantities - confirm this is intended rather than keeping the raw
# values.
labels = label_encode.fit_transform(df['Total Recycled (\'000 tonnes)'])
df['sortie1'] = labels
# Drop the original column in place; only the encoded copy remains in df.
df.drop(columns=['Total Recycled (\'000 tonnes)'], axis=1, inplace=True)
# Preview the frame after the replacement.
df.head()
Out[44]:
Total Generated ('000 tonnes) Year sortie sortie1
0 1624 2018 3 36
1 1269 2018 4 18
2 1054 2018 14 31
3 949 2018 15 14
4 763 2018 6 18
Entrée [45]: sns.scatterplot(x=df['sortie1'],y=df['sortie'], hue=df['Year'])
# Both axes show LabelEncoder class codes ('sortie1' from the recycled totals,
# 'sortie' from the waste type), not tonnages; points are coloured by 'Year'.
Out[45]: <AxesSubplot:xlabel='sortie1', ylabel='sortie'>
Entrée [50]: # Split the data into features (X) and target (y)
# Features are every column except 'Year'; 'Year' is the classification target.
X = df.drop('Year', axis=1)
y = df['Year']
# Split the data into training and test sets.
# A fixed random_state makes the split - and every score computed from it -
# reproducible between runs. stratify=y keeps the proportion of each year the
# same in the training and test subsets, which matters with only 45 rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Scale the features using StandardScaler. The scaler is fitted on the training
# set only and then applied to the test set, so no test-set statistics leak
# into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
(36, 3) (36,) (9, 3) (9,)