Course: BCSE424L_ ML for Robotics
Date: 12.01.2024
Name: Harisankar R N R
Reg No: 21BRS1524
Lab1: Linear and Multilinear Regression
In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use a large default figure size for every plot drawn after this point.
plt.rcParams['figure.figsize'] = (20.0, 10.0)

# Download the dataset and give the appropriate path.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any other machine until it is pointed at a local copy of headbrain.csv.
data = pd.read_csv("C:/Users/91812/21BRS1518/headbrain.csv")
print(data.shape)
data.head()
(237, 4)
Out[1]: Gender Age Range Head Size(cm^3) Brain Weight(grams)
0 1 1 4512 1530
1 1 1 3738 1297
2 1 1 4261 1335
3 1 1 3777 1282
4 1 1 4177 1590
In [2]:
# Simple linear regression of brain weight (Y) on head size (X), fitted with the
# closed-form least-squares formulas:
#   b1 = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
#   b0 = mean_y - b1 * mean_x
X = data['Head Size(cm^3)'].values
Y = data['Brain Weight(grams)'].values

mean_x = np.mean(X)
mean_y = np.mean(Y)
n = len(X)  # total number of values

# Vectorised sums replace the element-by-element Python loop.
x_dev = X - mean_x
numer = np.sum(x_dev * (Y - mean_y))
denom = np.sum(x_dev ** 2)

b1 = numer / denom           # slope
b0 = mean_y - (b1 * mean_x)  # intercept
print(b1, b0)  # print coefficients
In [3]:
0.26342933948939945 325.57342104944223
In [4]:
# Evaluate the fitted line over a range that extends 100 units beyond the
# smallest and largest observed X so the line overshoots the scatter slightly.
max_x = np.max(X) + 100
min_x = np.min(X) - 100
x = np.linspace(min_x, max_x, 1000)
y = b0 + b1 * x

plt.plot(x, y, color='#58b970', label='Regression Line')  # plotting the line
plt.scatter(X, Y, c='#ef5423', label='Scatter Plot')  # plotting the scatter points
plt.xlabel('Head Size in cm3')
plt.ylabel('Brain Weight in grams')
plt.legend()
plt.show()
In [5]:
# Coefficient of determination: R^2 = 1 - SS_residual / SS_total.
y_pred = b0 + b1 * X                # fitted value for every observation
ss_t = np.sum((Y - mean_y) ** 2)    # total sum of squares
ss_r = np.sum((Y - y_pred) ** 2)    # residual sum of squares
r2 = 1 - (ss_r / ss_t)
print(r2)
0.6393117199570003
In [ ]:
# Importing the dependencies
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# NOTE(review): this suppresses every warning for the rest of the session,
# which also hides deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')
In [1]:
Out[4]:
Out[5]:
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
# Column Non-Null Count Dtype
0 age 1338 non-null int64
1 sex 1338 non-null object
2 bmi 1338 non-null float64
3 children 1338 non-null int64
4 smoker 1338 non-null object
5 region 1338 non-null object
6 charges 1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
# Check for missing values.
# In case null values are found, a decision needs to be taken on whether to
# drop the affected rows or the entire columns.
insurance_dataset.isnull().sum()
In [8]:
age 0
Out[8]:
sex 0
bmi 0
children 0
smoker 0
region 0
charges 0
dtype: int64
# In [9]: statistical measures of the dataset
insurance_dataset.describe()
Out[9]: age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010
# Side-by-side box plots of the 'age', 'bmi' and 'children' columns.
labels = ['age', 'bmi', 'children']
B = plt.boxplot(
    [insurance_dataset['age'], insurance_dataset['bmi'], insurance_dataset['children']],
    labels=labels,
)
plt.show()
In [10]:
# Distribution of 'age' values
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the installed version is confirmed.
sns.set()
plt.figure(figsize=(6, 6))
sns.distplot(insurance_dataset['age'])
plt.title('Age Distribution')
plt.show()
In [11]:
# Gender column: number of rows per value of 'sex'
plt.figure(figsize=(6, 6))
sns.countplot(x='sex', data=insurance_dataset)
plt.title('Sex Distribution')
plt.show()
In [12]:
Name: sex, dtype: int64
# BMI distribution
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the installed version is confirmed.
sns.set()
plt.figure(figsize=(6, 6))
sns.distplot(insurance_dataset['bmi'])
# The title previously read 'Age Distribution' (copied from the age cell)
# although this figure plots the 'bmi' column.
plt.title('BMI Distribution')
plt.show()
In [14]:
# Children column: number of rows per value of 'children'
plt.figure(figsize=(6, 6))
sns.countplot(x='children', data=insurance_dataset)
plt.title('Children')
plt.show()
In [15]:
1 324
2 240
3 157
4 25
5 18
Name: children, dtype: int64
# Smoker column: number of rows per value of 'smoker'
plt.figure(figsize=(6, 6))
sns.countplot(x='smoker', data=insurance_dataset)
plt.title('Smoker')
plt.show()
In [17]:
yes 274
Name: smoker, dtype: int64
# Region column: number of rows per value of 'region'
plt.figure(figsize=(6, 6))
sns.countplot(x='region', data=insurance_dataset)
plt.title('Region')
plt.show()
In [19]:
northwest 325
northeast 324
Name: region, dtype: int64
# Distribution of 'charges' values
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the installed version is confirmed.
sns.set()
plt.figure(figsize=(6, 6))
sns.distplot(insurance_dataset['charges'])
plt.title('Charge Distribution')
plt.show()
In [21]:
# Replace the string categories with integer codes, modifying
# insurance_dataset in place.

# encoding 'sex' column
insurance_dataset.replace({'sex': {'male': 0, 'female': 1}}, inplace=True)

# encoding 'smoker' column
# NOTE: 'yes' maps to 0 and 'no' maps to 1, so a value of 1 means non-smoker.
insurance_dataset.replace({'smoker': {'yes': 0, 'no': 1}}, inplace=True)

# encoding 'region' column
insurance_dataset.replace(
    {'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3}},
    inplace=True,
)
In [22]:
# Print the frame; the recorded output shows 'sex', 'smoker' and 'region'
# holding integer codes, i.e. the state after the encoding cell has run.
print(insurance_dataset)
In [23]:
age sex bmi children smoker region charges
0 19 1 27.900 0 0 1 16884.92400
1 18 0 33.770 1 1 0 1725.55230
2 28 0 33.000 3 1 0 4449.46200
3 33 0 22.705 0 1 3 21984.47061
4 32 0 28.880 0 1 3 3866.85520
# In [13]: number of rows per value of 'sex'
insurance_dataset['sex'].value_counts()
Out[13]: male 676
female 662
... ... ... ... ... ... ... ...
1333 50 0 30.970 3 1 3 10600.54830
1334 18 1 31.920 0 1 2 2205.98080
1335 18 1 36.850 0 1 0 1629.83350
1336 21 1 25.800 0 1 1 2007.94500
1337 61 1 29.070 0 0 3 29141.36030
# Print Y; presumably the 'charges' target column at this point — confirm
# against the cell that assigns Y from insurance_dataset.
print(Y)
# In [16]: number of rows per value of 'children'
insurance_dataset['children'].value_counts()
Out[16]: 0 574
In [26]:
# In [18]: number of rows per value of 'smoker'
insurance_dataset['smoker'].value_counts()
Out[18]: no 1064
# In [20]: number of rows per value of 'region'
insurance_dataset['region'].value_counts()
Out[20]: southeast 364
southwest 325
0 16884.92400
1 1725.55230
2 4449.46200
3 21984.47061
4 3866.85520
...
1333 10600.54830
1334 2205.98080
1335 1629.83350
1336 2007.94500
1337 29141.36030
# R squared value for testing data
# Y_test and testing_data_prediction are expected to come from the train/test
# split and prediction cells, which are not part of this block.
r2_test = metrics.r2_score(Y_test, testing_data_prediction)
print("R-squared value", r2_test)
[1338 rows x 7 columns]
# In [24]: separate the feature columns (X) from the target column (Y).
# NOTE: this rebinds X and Y, which the head-size regression cells earlier in
# the notebook also assign; re-running those cells afterwards overwrites them.
X = insurance_dataset.drop(columns='charges')  # columns= already implies axis=1
Y = insurance_dataset['charges']
print(X)
In [25]:
age sex bmi children smoker region
0 19 1 27.900 0 0 1
1 18 0 33.770 1 1 0
2 28 0 33.000 3 1 0
3 33 0 22.705 0 1 3
4 32 0 28.880 0 1 3
... ... ... ... ... ...
1333 50 0 30.970 3 1 3
1334 18 1 31.920 0 1 2
1335 18 1 36.850 0 1 0
1336 21 1 25.800 0 1 1
1337 61 1 29.070 0 0 3
[1338 rows x 6 columns]
# One sample to predict, in the feature-column order shown for X:
# (age, sex, bmi, children, smoker, region), using the integer codes
# assigned in the encoding cell.
input_data = (31, 1, 25.74, 0, 1, 0)

# changing input_data to a numpy array
input_data_array = np.asarray(input_data)
In [33]:
R-squared value 0.7447273869684077
In [36]:
[3760.0805765]
The insurance cost is USD 3760.0805764960514
In [ ]: