In [1]: # Suppress warnings
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
housing = pd.read_csv("Housing.csv")
# Check the head of the dataset
housing.head()
housing.shape
housing.info()
housing.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
price 545 non-null int64
area 545 non-null int64
bedrooms 545 non-null int64
bathrooms 545 non-null int64
stories 545 non-null int64
mainroad 545 non-null object
guestroom 545 non-null object
basement 545 non-null object
hotwaterheating 545 non-null object
airconditioning 545 non-null object
parking 545 non-null int64
prefarea 545 non-null object
furnishingstatus 545 non-null object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
Out[1]:
price area bedrooms bathrooms stories parking
count 5.450000e+02 545.000000 545.000000 545.000000 545.000000 545.000000
mean 4.766729e+06 5150.541284 2.965138 1.286239 1.805505 0.693578
std 1.870440e+06 2170.141023 0.738064 0.502470 0.867492 0.861586
min 1.750000e+06 1650.000000 1.000000 1.000000 1.000000 0.000000
25% 3.430000e+06 3600.000000 2.000000 1.000000 1.000000 0.000000
50% 4.340000e+06 4600.000000 3.000000 1.000000 2.000000 0.000000
75% 5.740000e+06 6360.000000 3.000000 2.000000 2.000000 1.000000
max 1.330000e+07 16200.000000 6.000000 4.000000 4.000000 3.000000
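Before plotting, a quick completeness check is worth running (a sketch, not part of the original run):
In [ ]: # Confirm there are no missing values or duplicate rows to clean up
print(housing.isnull().sum().sum(), "missing values")
print(housing.duplicated().sum(), "duplicate rows")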
In [2]: import matplotlib.pyplot as plt
import seaborn as sns
In [3]: sns.pairplot(housing)
plt.show()
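The full pairplot is large; one optional refinement (not in the original run) is to restrict it to a few numeric columns via the vars parameter:
In [ ]: # Focus the pairplot on the variables of most interest
sns.pairplot(housing, vars = ['price', 'area', 'bedrooms', 'bathrooms'])
plt.show()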
In [4]: plt.figure(figsize=(20, 12))
plt.subplot(2,3,1)
sns.boxplot(x = 'mainroad', y = 'price', data = housing)
plt.subplot(2,3,2)
sns.boxplot(x = 'guestroom', y = 'price', data = housing)
plt.subplot(2,3,3)
sns.boxplot(x = 'basement', y = 'price', data = housing)
plt.subplot(2,3,4)
sns.boxplot(x = 'hotwaterheating', y = 'price', data = housing)
plt.subplot(2,3,5)
sns.boxplot(x = 'airconditioning', y = 'price', data = housing)
plt.subplot(2,3,6)
sns.boxplot(x = 'furnishingstatus', y = 'price', data = housing)
plt.show()
In [5]: plt.figure(figsize = (10, 5))
sns.boxplot(x = 'furnishingstatus', y = 'price', hue = 'airconditioning', data = housing)
plt.show()
In [6]: # List of variables to map
varlist = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
# Defining the map function
def binary_map(x):
    return x.map({'yes': 1, 'no': 0})
# Applying the function to the selected columns of the housing dataframe
housing[varlist] = housing[varlist].apply(binary_map)
# Check the housing dataframe now
housing.head()
Out[6]:
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea furnishingstatus
0 13300000 7420 4 2 3 1 0 0 0 1 2 1 furnished
1 12250000 8960 4 4 4 1 0 0 0 1 3 0 furnished
2 12250000 9960 3 2 2 1 0 1 0 0 2 1 semi-furnished
3 12215000 7500 4 2 2 1 0 1 0 1 3 1 furnished
4 11410000 7420 4 1 2 1 1 1 0 1 2 0 furnished
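A quick check (a sketch, not in the original run) that the mapping produced only the values 0 and 1 in every mapped column:
In [ ]: # Each mapped column should now contain exactly two distinct values
housing[varlist].nunique()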
In [8]: # Get the dummy variables for the feature 'furnishingstatus' and store it in a new variable - 'status'
status = pd.get_dummies(housing['furnishingstatus'])
# Check what the dataset 'status' looks like
status.head()
Out[8]:
furnished semi-furnished unfurnished
0 1 0 0
1 1 0 0
2 0 1 0
3 1 0 0
4 1 0 0
In [9]: # Let's drop the first column from status df using 'drop_first = True'
status = pd.get_dummies(housing['furnishingstatus'], drop_first = True)
# Add the results to the original housing dataframe
housing = pd.concat([housing, status], axis = 1)
# Now let's see the head of our dataframe.
housing.head()
# Drop 'furnishingstatus' as we have created the dummies for it
housing.drop(['furnishingstatus'], axis = 1, inplace = True)
housing.head()
Out[9]:
price area bedrooms bathrooms stories mainroad guestroom basement hotwaterheating airconditioning parking prefarea semi-furnished unfurnished
0 13300000 7420 4 2 3 1 0 0 0 1 2 1 0 0
1 12250000 8960 4 4 4 1 0 0 0 1 3 0 0 0
2 12250000 9960 3 2 2 1 0 1 0 0 2 1 1 0
3 12215000 7500 4 2 2 1 0 1 0 1 3 1 0 0
4 11410000 7420 4 1 2 1 1 1 0 1 2 0 0 0
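For reference, the encode-concat-drop sequence above can be collapsed into a single call; a minimal sketch on a fresh copy of the raw data so it runs independently of the steps above:
In [ ]: # get_dummies can encode a column and drop the first level in one step
raw = pd.read_csv("Housing.csv")
pd.get_dummies(raw, columns = ['furnishingstatus'], drop_first = True).head()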
In [16]: from sklearn.model_selection import train_test_split
# random_state fixes the shuffle so the train and test sets always contain the same rows
np.random.seed(0)
df_train, df_test = train_test_split(housing, train_size = 0.7, test_size = 0.3, random_state = 100)
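A quick shape check (a sketch) confirms the 70/30 split of the 545 rows, which should give 381 training rows and 164 test rows:
In [ ]: # Verify the split sizes
print(df_train.shape, df_test.shape)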
In [18]: from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Apply the scaler to all numeric columns except the binary 'yes-no' and dummy variables
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()
# Let's check the correlation coefficients to see which variables are highly correlated
plt.figure(figsize = (16, 10))
sns.heatmap(df_train.corr(), annot = True, cmap="YlGnBu")
plt.show()
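Instead of reading correlations off the full heatmap, they can be ranked against the target directly (a sketch, not in the original run):
In [ ]: # Rank features by their correlation with price
df_train.corr()['price'].sort_values(ascending = False)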
In [19]: plt.figure(figsize=[6,6])
plt.scatter(df_train.area, df_train.price)
plt.show()
In [20]: y_train = df_train.pop('price')
X_train = df_train
In [21]: import statsmodels.api as sm
# Add a constant
X_train_lm = sm.add_constant(X_train[['area']])
# Create a first fitted model
lr = sm.OLS(y_train, X_train_lm).fit()
# Check the parameters obtained
lr.params
# Let's visualise the data with a scatter plot and the fitted regression line
plt.scatter(X_train_lm.iloc[:, 1], y_train)
plt.plot(X_train_lm.iloc[:, 1], 0.127 + 0.462*X_train_lm.iloc[:, 1], 'r')
plt.show()
# Print a summary of the linear regression model obtained
print(lr.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.283
Model: OLS Adj. R-squared: 0.281
Method: Least Squares F-statistic: 149.6
Date: Sat, 12 Apr 2025 Prob (F-statistic): 3.15e-29
Time: 09:46:42 Log-Likelihood: 227.23
No. Observations: 381 AIC: -450.5
Df Residuals: 379 BIC: -442.6
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.1269 0.013 9.853 0.000 0.102 0.152
area 0.4622 0.038 12.232 0.000 0.388 0.536
==============================================================================
Omnibus: 67.313 Durbin-Watson: 2.018
Prob(Omnibus): 0.000 Jarque-Bera (JB): 143.063
Skew: 0.925 Prob(JB): 8.59e-32
Kurtosis: 5.365 Cond. No. 5.99
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
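The plot above hard-codes the fitted intercept and slope (0.127 and 0.462). A sketch that reads them from lr.params instead, so the line stays correct if the model is refit:
In [ ]: # Draw the regression line from the fitted parameters rather than literals
b0, b1 = lr.params['const'], lr.params['area']
plt.scatter(X_train_lm.iloc[:, 1], y_train)
plt.plot(X_train_lm.iloc[:, 1], b0 + b1 * X_train_lm.iloc[:, 1], 'r')
plt.show()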
In [22]: # Assign all the feature variables to X
X_train_lm = X_train[['area', 'bathrooms']]
# Build a linear model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train_lm)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.params
# Check the summary
print(lr.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.480
Model: OLS Adj. R-squared: 0.477
Method: Least Squares F-statistic: 174.1
Date: Sat, 12 Apr 2025 Prob (F-statistic): 2.51e-54
Time: 09:47:12 Log-Likelihood: 288.24
No. Observations: 381 AIC: -570.5
Df Residuals: 378 BIC: -558.6
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.1046 0.011 9.384 0.000 0.083 0.127
area 0.3984 0.033 12.192 0.000 0.334 0.463
bathrooms 0.2984 0.025 11.945 0.000 0.249 0.347
==============================================================================
Omnibus: 62.839 Durbin-Watson: 2.157
Prob(Omnibus): 0.000 Jarque-Bera (JB): 168.790
Skew: 0.784 Prob(JB): 2.23e-37
Kurtosis: 5.859 Cond. No. 6.17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [23]: # Assign all the feature variables to X
X_train_lm = X_train[['area', 'bathrooms','bedrooms']]
# Build a linear model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train_lm)
lr = sm.OLS(y_train, X_train_lm).fit()
lr.params
# Print the summary of the model
print(lr.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.505
Model: OLS Adj. R-squared: 0.501
Method: Least Squares F-statistic: 128.2
Date: Sat, 12 Apr 2025 Prob (F-statistic): 3.12e-57
Time: 09:47:38 Log-Likelihood: 297.76
No. Observations: 381 AIC: -587.5
Df Residuals: 377 BIC: -571.7
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0414 0.018 2.292 0.022 0.006 0.077
area 0.3922 0.032 12.279 0.000 0.329 0.455
bathrooms 0.2600 0.026 10.033 0.000 0.209 0.311
bedrooms 0.1819 0.041 4.396 0.000 0.101 0.263
==============================================================================
Omnibus: 50.037 Durbin-Watson: 2.136
Prob(Omnibus): 0.000 Jarque-Bera (JB): 124.806
Skew: 0.648 Prob(JB): 7.92e-28
Kurtosis: 5.487 Cond. No. 8.87
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
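Each added feature raised R-squared (0.283, 0.480, 0.505), but adjusted R-squared is the fairer yardstick since it penalises extra terms. A sketch that refits the three candidates and compares them:
In [ ]: # Compare the nested models on adjusted R-squared
for cols in [['area'], ['area', 'bathrooms'], ['area', 'bathrooms', 'bedrooms']]:
    m = sm.OLS(y_train, sm.add_constant(X_train[cols])).fit()
    print(cols, round(m.rsquared_adj, 3))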
In [29]: # Check all the columns of the dataframe
housing.columns
Out[29]: Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
'parking', 'prefarea', 'semi-furnished', 'unfurnished'],
dtype='object')
In [31]: #Build a linear model
import statsmodels.api as sm
X_train_lm = sm.add_constant(X_train)
lr_1 = sm.OLS(y_train, X_train_lm).fit()
lr_1.params
print(lr_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.681
Model: OLS Adj. R-squared: 0.670
Method: Least Squares F-statistic: 60.40
Date: Sat, 12 Apr 2025 Prob (F-statistic): 8.83e-83
Time: 09:53:02 Log-Likelihood: 381.79
No. Observations: 381 AIC: -735.6
Df Residuals: 367 BIC: -680.4
Df Model: 13
Covariance Type: nonrobust
===================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------
const 0.0200 0.021 0.955 0.340 -0.021 0.061
area 0.2347 0.030 7.795 0.000 0.175 0.294
bedrooms 0.0467 0.037 1.267 0.206 -0.026 0.119
bathrooms 0.1908 0.022 8.679 0.000 0.148 0.234
stories 0.1085 0.019 5.661 0.000 0.071 0.146
mainroad 0.0504 0.014 3.520 0.000 0.022 0.079
guestroom 0.0304 0.014 2.233 0.026 0.004 0.057
basement 0.0216 0.011 1.943 0.053 -0.000 0.043
hotwaterheating 0.0849 0.022 3.934 0.000 0.042 0.127
airconditioning 0.0669 0.011 5.899 0.000 0.045 0.089
parking 0.0607 0.018 3.365 0.001 0.025 0.096
prefarea 0.0594 0.012 5.040 0.000 0.036 0.083
semi-furnished 0.0009 0.012 0.078 0.938 -0.022 0.024
unfurnished -0.0310 0.013 -2.440 0.015 -0.056 -0.006
==============================================================================
Omnibus: 93.687 Durbin-Watson: 2.093
Prob(Omnibus): 0.000 Jarque-Bera (JB): 304.917
Skew: 1.091 Prob(JB): 6.14e-67
Kurtosis: 6.801 Cond. No. 14.6
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
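Before trusting the p-values above, it is worth inspecting the residuals, since OLS inference assumes errors centred on zero and roughly normal (a sketch, not part of the original run):
In [ ]: # Residual analysis for the full model
res = y_train - lr_1.predict(X_train_lm)
plt.hist(res, bins = 30)
plt.xlabel('Residuals')
plt.show()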
In [51]: # Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
Out[51]:
Features VIF
1 bedrooms 7.33
4 mainroad 6.02
0 area 4.67
3 stories 2.70
11 semi-furnished 2.19
9 parking 2.12
6 basement 2.02
12 unfurnished 1.82
8 airconditioning 1.77
2 bathrooms 1.67
10 prefarea 1.51
5 guestroom 1.47
7 hotwaterheating 1.14
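One caveat worth hedging: variance_inflation_factor is conventionally applied to a design matrix that includes the intercept column, and computing it on uncentred data without a constant can overstate the VIFs. A sketch recomputing with the constant added (its own row is then ignored):
In [ ]: # Recompute VIFs with an intercept column included
X_vif = sm.add_constant(X_train)
vif_c = pd.DataFrame({'Features': X_vif.columns,
                      'VIF': [variance_inflation_factor(X_vif.values, i)
                              for i in range(X_vif.shape[1])]})
vif_c[vif_c['Features'] != 'const'].sort_values('VIF', ascending = False)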