import numpy as np
import pandas as pd
dataset = pd.read_csv("Data.csv")
dataset
{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n
\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.693792591722527,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
9,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 12265.579661982732,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 9,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
52000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}
dataset["Age"].fillna(np.mean(dataset["Age"]))
0 44.000000
1 27.000000
2 30.000000
3 38.000000
4 40.000000
5 35.000000
6 38.777778
7 48.000000
8 50.000000
9 37.000000
Name: Age, dtype: float64
dataset["Age"] = dataset["Age"].fillna(np.mean(dataset["Age"]))
dataset
{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n
\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.253777219533095,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
10,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 12265.579661982732,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 9,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
52000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}
dataset["Salary"] =
dataset["Salary"].fillna(np.mean(dataset["Salary"]))
dataset
{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n
\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 3,\n \"samples\": [\n
\"France\",\n \"Spain\",\n \"Germany\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 7.253777219533095,\n
\"min\": 27.0,\n \"max\": 50.0,\n \"num_unique_values\":
10,\n \"samples\": [\n 50.0,\n 27.0,\n
35.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 11564.099405562389,\n \"min\": 48000.0,\n
\"max\": 83000.0,\n \"num_unique_values\": 10,\n
\"samples\": [\n 83000.0,\n 48000.0,\n
58000.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Purchased\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Yes\",\n \"No\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"dataset"}
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset["Purchased"] = le.fit_transform(dataset["Purchased"])
dataset["Country"] = le.fit_transform(dataset["Country"])
dataset
{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n
\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 2,\n
\"num_unique_values\": 3,\n \"samples\": [\n 0,\n
2,\n 1\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.253777219533095,\n \"min\": 27.0,\n \"max\":
50.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
50.0,\n 27.0,\n 35.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
11564.099405562389,\n \"min\": 48000.0,\n \"max\":
83000.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
83000.0,\n 48000.0,\n 58000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Purchased\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 1,\n
\"num_unique_values\": 2,\n \"samples\": [\n 1,\n
0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"dataset"}
dataset.iloc[:,:-1]
{"summary":"{\n \"name\": \"dataset\",\n \"rows\": 10,\n
\"fields\": [\n {\n \"column\": \"Country\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 0,\n \"max\": 2,\n
\"num_unique_values\": 3,\n \"samples\": [\n 0,\n
2,\n 1\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.253777219533095,\n \"min\": 27.0,\n \"max\":
50.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
50.0,\n 27.0,\n 35.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
11564.099405562389,\n \"min\": 48000.0,\n \"max\":
83000.0,\n \"num_unique_values\": 10,\n \"samples\": [\n
83000.0,\n 48000.0,\n 58000.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe"}
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dataset.iloc[:,:-
1], dataset["Purchased"], test_size=0.2)
# print(x_train, x_test, y_train, y_test)
y_train
2 0
8 0
4 1
6 0
5 1
0 0
1 1
9 1
Name: Purchased, dtype: int64
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
LinearRegression()
x_test
{"summary":"{\n \"name\": \"x_test\",\n \"rows\": 2,\n \"fields\":
[\n {\n \"column\": \"Country\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 2,\n \"num_unique_values\": 2,\n \"samples\":
[\n 2,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Age\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 7.0710678118654755,\n \"min\":
38.0,\n \"max\": 48.0,\n \"num_unique_values\": 2,\n
\"samples\": [\n 38.0,\n 48.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
12727.922061357855,\n \"min\": 61000.0,\n \"max\":
79000.0,\n \"num_unique_values\": 2,\n \"samples\": [\n
61000.0,\n 79000.0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"x_test"}
predict = lr.predict(x_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, predict)
0.29618249533841706
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=150)
rfc.fit(x_train,y_train)
RandomForestClassifier(max_depth=150)
rfc.predict(x_test)
array([0, 1])
x_test
{"summary":"{\n \"name\": \"x_test\",\n \"rows\": 2,\n \"fields\":
[\n {\n \"column\": \"Country\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 2,\n \"num_unique_values\": 2,\n \"samples\":
[\n 2,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"Age\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 7.0710678118654755,\n \"min\":
38.0,\n \"max\": 48.0,\n \"num_unique_values\": 2,\n
\"samples\": [\n 38.0,\n 48.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Salary\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\":
12727.922061357855,\n \"min\": 61000.0,\n \"max\":
79000.0,\n \"num_unique_values\": 2,\n \"samples\": [\n
61000.0,\n 79000.0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"x_test"}
y_test
7 1
3 0
Name: Purchased, dtype: int64