CMP4293 INTRODUCTION TO AI PRODUCED BY DR.
MARIAM ADEDOYIN-OLOWE
Welcome to the Week 7 lab session, where you will continue to work with the “Insurance.csv”
data. This time, however, you will apply Linear Regression to the data to predict the insurance
premium people will be charged, based on attributes such as age, BMI, gender and smoking status.
from google.colab import files
file = files.upload()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the insurance dataset and print its column / dtype summary.
data = pd.read_csv('insurance.csv')
data.info()

# Count the missing values in each column.
data.isna().sum()

# Count the rows that are exact repeats of an earlier row.
data.duplicated().sum()

# Show every row involved in a duplication; keep=False flags the first
# occurrence as well as its repeats.
dup_mask = data.duplicated(keep=False)
data.loc[dup_mask]

# Drop the repeated rows in place.
data.drop_duplicates(inplace=True)

# Re-count to confirm that no duplicated rows are left.
data.duplicated().sum()

# Frequency of each value in the 'sex' column.
data['sex'].value_counts()
# To convert the text columns into numbers, first display the value counts
# of each column that holds text (object) values.
for text_col in ('sex', 'smoker', 'region'):
    display(data[text_col].value_counts())
#import the relevant sklearn libraries needed to convert the text
#columns into numeric values
CMP4293 INTRODUCTION TO AI PRODUCED BY DR. MARIAM ADEDOYIN-OLOWE
Welcome to the Week 7 lab session where you will continue to work on with the “Insurance.csv”
data. However, you will apply Linear Regression on the data to predict what insurance premium
people will be based on different attributes such as age, BMI, gender and smoking status.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Create one label encoder for 'sex' and a separate one for 'smoker'.
le_sex = LabelEncoder()
le_smoker = LabelEncoder()

# fit() learns the mapping from each distinct text value to an integer code
# (e.g. the two values female / male become 0 / 1), so only the unique
# values of each column are passed in.
le_sex.fit(data['sex'].drop_duplicates())
le_smoker.fit(data['smoker'].drop_duplicates())

# Apply the encoding and save the results in new columns. Duplicates are
# not dropped here because every row has to be transformed.
data['sex_enc'] = le_sex.transform(data['sex'])
data['smoker_enc'] = le_smoker.transform(data['smoker'])

# Now check the transformation.
data.head()
# Transform the 'region' column with OneHotEncoder. remainder='passthrough'
# tells the transformer to leave the remaining columns as they are.
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), ['region'])],
    remainder='passthrough',
)
trans = ct.fit_transform(data)

# Build a new DataFrame from the transformed output, using the transformer's
# output feature names as the column headers.
ins_data = pd.DataFrame(trans, columns=ct.get_feature_names_out())

# List the new columns and preview the first rows.
list(ins_data.columns)
ins_data.head()
# Rename the columns. The names are assigned by position, so this list must
# stay in the same order as the columns ins_data currently has.
ins_data.columns = [
    'region_northeast',
    'region_northwest',
    # --- page header from the lab handout (kept as a comment so that the
    # --- list literal stays valid Python) ---
    # CMP4293 INTRODUCTION TO AI PRODUCED BY DR. MARIAM ADEDOYIN-OLOWE
    # Welcome to the Week 7 lab session where you will continue to work on with the “Insurance.csv”
    # data. However, you will apply Linear Regression on the data to predict what insurance premium
    # people will be based on different attributes such as age, BMI, gender and smoking status.
    'region_southeast',
    'region_southwest',
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'charges',
    'sex_enc',
    'smoker_enc',
]
# Reorder the columns: each text column is followed by its encoded version,
# the one-hot region columns come next, and 'charges' is placed last.
column_order = [
    'age',
    'sex', 'sex_enc',
    'bmi',
    'children',
    'smoker', 'smoker_enc',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
    'charges',
]
ins_data = ins_data[column_order]
# Leave out the text columns ('sex', 'smoker'), save the rest into a new
# dataset and convert every remaining column to a numeric dtype.
# NOTE(review): the conversion is presumably needed because the transformer
# output came back as object dtype - confirm with ins_data.info().
numeric_cols = [
    'age',
    'sex_enc',
    'bmi',
    'children',
    'smoker_enc',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
    'charges',
]
ins_data_t = ins_data[numeric_cols].apply(pd.to_numeric)
ins_data_t.info()
df_corr = ins_data_t[['age',
CMP4293 INTRODUCTION TO AI PRODUCED BY DR. MARIAM ADEDOYIN-OLOWE
Welcome to the Week 7 lab session where you will continue to work on with the “Insurance.csv”
data. However, you will apply Linear Regression on the data to predict what insurance premium
people will be based on different attributes such as age, BMI, gender and smoking status.
# Draw df_corr as a heatmap: each cell is annotated with its value to two
# decimals and the colour scale is pinned to the range [-1, 1].
sns.heatmap(df_corr, annot=True, fmt='.2f', vmin=-1, vmax=1)
from sklearn.model_selection import train_test_split
# Columns used for modelling: five predictors plus 'charges', which is kept
# as the last column.
feature_cols = ['age', 'sex_enc', 'bmi', 'children', 'smoker_enc']
df_feat = ins_data_t[feature_cols + ['charges']]

# X holds every column except 'charges'; y is the 'charges' column itself.
X = df_feat.drop(columns='charges')
y = df_feat['charges']

# Hold out 30% of the rows for testing; random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)
from sklearn.linear_model import LinearRegression
# Create the linear regression model and fit it on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# The fitted line is  y = a + B*X  where
#   a = model.intercept_
#   B = model.coef_
model.intercept_, model.coef_

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Error metrics on the test split. scikit-learn's metric functions take
# (y_true, y_pred) in that order; MSE and MAE are symmetric, so the numbers
# are the same either way, but the documented order is used here.
mse = mean_squared_error(y_test, y_pred)
sqrt_mse = np.sqrt(mse)  # root of the MSE, in the same units as the target
mae = mean_absolute_error(y_test, y_pred)
print(f"MSE : {mse:.3f}, MSE_SQRT : {sqrt_mse:.3f}, MAE : {mae:.3f}")

# score() returns the R^2 of the model's predictions on the given data.
r2 = model.score(X_test, y_test)
print(f"R2 score: {r2:.3f}")
# Smallest charge, largest charge and the spread between them, kept together
# as one tuple (the original wrapped line split this into two expressions).
charges = df_feat['charges']
(charges.min(), charges.max(), charges.max() - charges.min())

df_feat.columns

# Predict the charge for one new person. The values are labelled with the
# feature names, in the same order as the predictor columns of df_feat, so
# it is explicit which number belongs to which feature.
# NOTE(review): sex_enc=1 / smoker_enc=0 presumably mean male / non-smoker
# given LabelEncoder's sorted class order - confirm via le_sex.classes_ and
# le_smoker.classes_.
new_person = pd.DataFrame(
    [[50, 1, 45.9, 1, 0]],
    columns=['age', 'sex_enc', 'bmi', 'children', 'smoker_enc'],
)
val = model.predict(new_person)
print('Predicted Insurance Charge =', val)