In [22]: import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
wine_data = load_wine()
In [3]: wine_data.data
Out[3]: array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
                1.065e+03],
               [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
                1.050e+03],
               [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
                1.185e+03],
               ...,
               [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
                8.350e+02],
               [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
                8.400e+02],
               [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
                5.600e+02]])
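load_wine() returns a Bunch object whose .data attribute is the raw NumPy array shown above. As a side note, scikit-learn 0.23 and later can hand back a DataFrame directly, which would skip the manual conversion in the next cell; a minimal sketch:

# Alternative: ask for a DataFrame up front (requires scikit-learn >= 0.23)
wine_bunch = load_wine(as_frame=True)
wine_df_alt = wine_bunch.frame  # the 13 features plus a "target" column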
In [4]: # Convert the data to a pandas DataFrame
wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
In [5]: # Add the target label
wine_df["target"] = wine_data.target
In [6]: # Take a preview
wine_df.head()
Out[6]:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  flavanoids  ...
0    14.23        1.71  2.43               15.6      127.0           2.80        3.06  ...
1    13.20        1.78  2.14               11.2      100.0           2.65        2.76  ...
2    13.16        2.36  2.67               18.6      101.0           2.80        3.24  ...
3    14.37        1.95  2.50               16.8      113.0           3.85        3.49  ...
4    13.24        2.59  2.87               21.0      118.0           2.80        2.69  ...
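Before going further, it is also worth checking how the three cultivars (classes 0, 1 and 2) are distributed, since a strong imbalance would affect the modeling later on; a one-liner does it:

# Count how many samples fall into each class
print(wine_df["target"].value_counts())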
In [7]: wine_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  target                        178 non-null    int32
dtypes: float64(13), int32(1)
memory usage: 18.9 KB
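The info() output shows 178 non-null entries in every column, so there are no missing values to deal with. A quick check makes this explicit:

# Total number of missing values across the whole DataFrame (expected: 0)
print(wine_df.isna().sum().sum())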
In [8]: wine_df.describe()
Out[8]:
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  total_phenols  flavanoids  ...
count  178.000000  178.000000  178.000000         178.000000  178.000000     178.000000  178.000000  ...
mean    13.000618    2.336348    2.366517          19.494944   99.741573       2.295112    2.029270  ...
std      0.811827    1.117146    0.274344           3.339564   14.282484       0.625851    0.998859  ...
min     11.030000    0.740000    1.360000          10.600000   70.000000       0.980000    0.340000  ...
25%     12.362500    1.602500    2.210000          17.200000   88.000000       1.742500    1.205000  ...
50%     13.050000    1.865000    2.360000          19.500000   98.000000       2.355000    2.135000  ...
75%     13.677500    3.082500    2.557500          21.500000  107.000000       2.800000    2.875000  ...
max     14.830000    5.800000    3.230000          30.000000  162.000000       3.880000    5.080000  ...
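Note how different the feature scales are: magnesium lives in the hundreds while ash stays below 4. This disparity is what motivates the standardization step further down; printing the per-feature ranges makes it obvious:

# Per-feature range, to illustrate the scale differences
print(wine_df[wine_data.feature_names].max() - wine_df[wine_data.feature_names].min())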
In [9]: wine_df.tail()
Out[9]:
     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  flavanoids  ...
173    13.71        5.65  2.45               20.5       95.0           1.68        0.61  ...
174    13.40        3.91  2.48               23.0      102.0           1.80        0.75  ...
175    13.27        4.28  2.26               20.0      120.0           1.59        0.69  ...
176    13.17        2.59  2.37               20.0      120.0           1.65        0.68  ...
177    14.13        4.10  2.74               24.5       96.0           2.05        0.76  ...
In [11]: # Split the data into features and label
X = wine_df[wine_data.feature_names].copy()
y = wine_df["target"].copy()
In [12]: # Instantiate the scaler and fit it on the features
scaler = StandardScaler()
scaler.fit(X)
Out[12]: StandardScaler()
In [13]: # Transform the features
# (pass the DataFrame itself: the scaler was fit on a DataFrame, and newer
# scikit-learn versions warn when it is then asked to transform a bare array)
X_scaled = scaler.transform(X)
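The fit and transform steps can also be collapsed into one call, which is the more common idiom when no separate fitting data is involved:

# Equivalent one-step idiom
X_scaled = scaler.fit_transform(X)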
In [14]: # View the first instance
print(X_scaled[0])
[ 1.51861254 -0.5622498 0.23205254 -1.16959318 1.91390522 0.80899739
1.03481896 -0.65956311 1.22488398 0.25171685 0.36217728 1.84791957
1.01300893]
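As a sanity check, every standardized feature should now have mean ≈ 0 and standard deviation ≈ 1 over the full dataset:

# Verify the scaling worked as expected
print(X_scaled.mean(axis=0).round(6))  # all ~0
print(X_scaled.std(axis=0).round(6))   # all ~1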
In [16]: # Split the data into train and test sets (70/30)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, train_size=0.7)
In [17]: # Check the splits are correct
print(f"Train size: {round(len(X_train_scaled) / len(X) * 100)}%")
print(f"Test size: {round(len(X_test_scaled) / len(X) * 100)}%")
Train size: 70%
Test size: 30%
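Without a fixed seed, this split changes on every run. For a reproducible, class-balanced split one would typically also pass random_state (the value 0 below is arbitrary) and stratify:

# Reproducible, stratified variant of the split above (seed chosen for illustration)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, train_size=0.7, random_state=0, stratify=y
)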
In [19]: # Instantiating the models
logistic_regression = LogisticRegression()
svm = SVC()
tree = DecisionTreeClassifier()
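All three models are created with their default hyperparameters. For reference, these are some of the knobs one might tune later; the values below are purely illustrative, not recommendations:

# Illustrative non-default settings (example values only)
logistic_regression = LogisticRegression(max_iter=1000, C=1.0)
svm = SVC(kernel="rbf", C=1.0, gamma="scale")
tree = DecisionTreeClassifier(max_depth=5, random_state=0)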
In [20]: # Training the models
logistic_regression.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)
tree.fit(X_train_scaled, y_train)
Out[20]: DecisionTreeClassifier()
In [21]: # Making predictions with each model
log_reg_preds = logistic_regression.predict(X_test_scaled)
svm_preds = svm.predict(X_test_scaled)
tree_preds = tree.predict(X_test_scaled)
In [23]: # Store model predictions in a dictionary
# this makes it easier to iterate through each model
# and print the results.
model_preds = {
"Logistic Regression": log_reg_preds,
"Support Vector Machine": svm_preds,
"Decision Tree": tree_preds
}
In [24]: for model, preds in model_preds.items():
    print(f"{model} Results:\n{classification_report(y_test, preds)}", sep="\n\n")
Logistic Regression Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      0.92      0.96        25
           2       0.86      1.00      0.92        12

    accuracy                           0.96        54
   macro avg       0.95      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54

Support Vector Machine Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        25
           2       1.00      1.00      1.00        12

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54

Decision Tree Results:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        17
           1       0.92      0.92      0.92        25
           2       0.92      0.92      0.92        12

    accuracy                           0.93        54
   macro avg       0.93      0.93      0.93        54
weighted avg       0.93      0.93      0.93        54
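classification_report gives a per-class breakdown; for a more compact comparison between models, one could also print a single accuracy score and a confusion matrix each, for example:

from sklearn.metrics import accuracy_score, confusion_matrix

# Compact per-model summary
for model, preds in model_preds.items():
    print(f"{model}: accuracy = {accuracy_score(y_test, preds):.2f}")
    print(confusion_matrix(y_test, preds))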