Input:
# Load the Leslie Salt workbook and take a first look at its contents.
# NOTE(review): the working directory below is machine-specific; adjust it (or
# run the script from the folder that holds the workbook) on another machine.
setwd("C:\\Users\\Baba\\Desktop\\Advance Stat")
library(readxl)
mydataset <- read_xlsx("Dataset_LeslieSalt.xlsx")
# Five-number summaries and column types of the raw data
summary(mydataset)
str(mydataset)
# attach(mydataset) is deliberately not called: the data frame is modified
# further down (factor conversion, row removal), and an attached copy would keep
# exposing the stale, unmodified columns on the search path. Every later step
# refers to the data through mydataset$..., with(mydataset, ...) or data = .
# Convert County and Flood from 0/1 codes into labelled factors.
# County: code 0 -> "San Meteo", code 1 -> "Sant Clara". The label spelling is
# kept as-is because the prediction step further down passes the same text.
mydataset$County <- factor(mydataset$County, levels = c("0", "1"),
                           labels = c("San Meteo", "Sant Clara"))
# Flood: labels follow the order of `levels`, so code 0 must carry the
# "no flood" label and code 1 the "flood" label. The data-set description
# defines Flood = 1 as "subject to flooding" (verify against the source of the
# workbook); the previous labelling had the two swapped.
mydataset$Flood <- factor(mydataset$Flood, levels = c("0", "1"),
                          labels = c("Flood-No", "Flood-Yes"))
## Packages for descriptive statistics (psych) and regression tests (lmtest, zoo)
library(psych)
library(lmtest)
library(zoo)
## Descriptive statistics for every column of the data
describe(mydataset)
## Outlier check: box plot of Price
boxplot(mydataset[["Price"]], main = "Boxplot for price")
## Outlier treatment: discard observation 26
## NOTE(review): presumably row 26 is the extreme Price shown in the box plot -
## confirm, since the index is hard-coded and depends on the row order.
mydataset <- mydataset[-26, ]
# Identify correlation among the numeric columns. County and Flood have been
# converted to factors above, so is.numeric() leaves them out.
mydataset_cor <- as.matrix(Filter(is.numeric, mydataset))
# corrplot is referenced through its namespace because the script never
# attaches the package with library().
corrplot::corrplot(cor(mydataset_cor), order = "hclust",
                   main = "corelation plot")
# Observation: Price is most strongly correlated with Date, followed by
# Elevation, and only weakly correlated with Size and Distance.
# Multicollinearity: the independent variables Size and Distance are
# correlated with each other.
# Full regression model: Price explained by every other column of mydataset.
model1 <- lm(Price ~ ., data= mydataset)
# Coefficient table and overall fit of the full model
summary(model1)
# Analysis-of-variance table for the full model
anova(model1)
# Reduced model: same as model1 but without Distance and Size, the two
# predictors noted above as correlated with each other.
model2<-lm(Price~.-Distance -Size,data=mydataset)
summary(model2)
# In the recorded run the overall F-test p-value of model2 is below 0.05 and
# every remaining coefficient is significant at the 5% level. Its R-squared is
# slightly lower than model1's (0.7747 vs 0.8069; adjusted 0.7278 vs 0.7454),
# so model2 is accepted as the more parsimonious model.
## Pull the fitted values and the residuals out of model2, then display both
fit2 <- fitted(model2)
res2 <- residuals(model2)
fit2
res2
## Append the fitted values and residuals to the data as two extra columns
dataset_reg <- cbind(mydataset, fit2, res2)
## Residuals-versus-fitted scatter plot with a horizontal reference line at 0
with(dataset_reg, plot(fit2, res2, pch = 19, cex = 0.6))
abline(h = 0)
# Prediction for the problem statement.
# Scenario assumed for the property being valued:
#   County    = Sant Clara (raw code 1)
#   Elevation = 0 (no elevation)
#   Sewer     = 0 (not clear from the problem statement, hence kept at 0)
#   Date      = 6 (the problem statement also mentions 2 and 4 months)
#   Flood     = no flood (raw code 0)
#   Distance  = 0
# County and Flood are picked by level position instead of by label text: both
# factors are built with levels = c("0", "1"), so the first level is raw code 0
# and the second is raw code 1. Passing the text "Flood-No" selected the level
# built from code 1 - per the data-set description the "subject to flooding"
# case (verify) - which pulled the flood coefficient into a "no flood" scenario.
mydataset2 <- data.frame(
  Price = 0,  # placeholder; the response is not used when predicting
  County = levels(mydataset$County)[2],
  Size = 246.8,
  Elevation = 0,
  Sewer = 0,
  Date = 6,
  Flood = levels(mydataset$Flood)[1],
  Distance = 0
)
# NOTE(review): the factor of 1000 presumably converts Price from thousands of
# dollars to dollars - confirm against the data-set description.
mydataset2$price_predection <- predict(model2, mydataset2) * 1000
mydataset2$price_predection
## Normality checks with the Shapiro-Wilk test
## Null hypothesis: the sample comes from a normal distribution.
# Each call below tests a single numeric predictor on its own; these are
# univariate tests, not a test of multivariate normality.
with(mydataset, shapiro.test(Size))
with(mydataset, shapiro.test(Elevation))
with(mydataset, shapiro.test(Sewer))
with(mydataset, shapiro.test(Date))
with(mydataset, shapiro.test(Distance))
# The normality assumption of linear regression concerns the model errors, not
# the predictors, so the residuals of the chosen model are tested as well.
shapiro.test(residuals(model2))
library(car)
# VIF - variance inflation factor of each predictor in model2
# (a check for multicollinearity among the retained predictors)
vif(model2)
# Durbin-Watson test for autocorrelation of the residuals
# Null hypothesis: there is no autocorrelation
# Alternative hypothesis: there is autocorrelation (positive or negative)
# dwtest() defaults to the one-sided alternative "greater"; the two-sided
# alternative is requested so that the test matches the hypotheses stated here.
dwtest(model2, alternative = "two.sided")
# Homoscedasticity tested with the Goldfeld-Quandt test
# Null hypothesis: the data satisfy the condition of homoscedasticity
# Alternative hypothesis: the data are not homoscedastic
# gqtest() also defaults to "greater" (variance increases from segment 1 to
# segment 2), which cannot flag a decreasing variance; the recorded run gave
# GQ = 0.178, far below 1, so the two-sided alternative is used here.
gqtest(model2, alternative = "two.sided")
output:
> setwd("C:\\Users\\Baba\\Desktop\\Advance Stat")
> library(readxl)
> mydataset<-read_xlsx("Dataset_LeslieSalt.xlsx")
> summary(mydataset)
Price County Size Elevation
Sewer Date Flood
Min. : 1.70 Min. :0.0000 Min. : 6.90 Min. : 0.000 Min.
: 0 Min. :-103.00 Min. :0.0000
1st Qu.: 5.35 1st Qu.:0.0000 1st Qu.: 20.35 1st Qu.: 2.000 1st
Qu.: 0 1st Qu.: -63.50 1st Qu.:0.0000
Median :11.70 Median :1.0000 Median : 51.40 Median : 4.000
Median : 900 Median : -59.00 Median :0.0000
Mean :11.95 Mean :0.6129 Mean : 139.97 Mean : 4.645 Mean
: 1981 Mean : -58.65 Mean :0.1613
3rd Qu.:16.05 3rd Qu.:1.0000 3rd Qu.: 104.10 3rd Qu.: 7.000 3rd
Qu.: 3450 3rd Qu.: -51.00 3rd Qu.:0.0000
Max. :37.20 Max. :1.0000 Max. :1695.20 Max. :20.000 Max.
:10000 Max. : -4.00 Max. :1.0000
Distance
Min. : 0.000
1st Qu.: 0.850
Median : 4.900
Mean : 5.132
3rd Qu.: 5.500
Max. :16.500
> str(mydataset)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31 obs. of 8 variables:
$ Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
$ County : num 1 1 0 0 0 1 1 1 1 1 ...
$ Size : num 138.4 52 16.1 1695.2 845 ...
$ Elevation: num 10 4 0 1 1 2 4 4 20 0 ...
$ Sewer : num 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
$ Date : num -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
$ Flood : num 0 0 1 0 1 0 0 0 0 0 ...
$ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
> attach(mydataset)
> #Converting country Aand flood into factors
> mydataset$County<-factor(mydataset$County, levels=c("0","1"), labels =
c("San Meteo", "Sant Clara"))
> mydataset$Flood<-factor(mydataset$Flood,levels = c("0","1"),labels =
c("Flood-Yes","Flood-No"))
> ##Load the psych and lmtest Libraries
> library(psych)
> library(lmtest)
> library(zoo)
> describe(mydataset)
vars n mean sd median trimmed mad min max
range skew kurtosis se
Price 1 31 11.95 7.71 11.7 11.21 8.90 1.7 37.2
35.5 1.03 1.39 1.39
County* 2 31 1.61 0.50 2.0 1.64 0.00 1.0 2.0
1.0 -0.44 -1.86 0.09
Size 3 31 139.97 327.17 51.4 57.98 52.34 6.9 1695.2
1688.3 3.77 14.18 58.76
Elevation 4 31 4.65 4.36 4.0 4.12 2.97 0.0 20.0
20.0 1.43 2.61 0.78
Sewer 5 31 1981.29 2481.31 900.0 1576.80 1334.34 0.0 10000.0
10000.0 1.29 1.30 445.66
Date 6 31 -58.65 24.53 -59.0 -59.56 8.90 -103.0 -4.0
99.0 0.20 0.08 4.41
Flood* 7 31 1.16 0.37 1.0 1.08 0.00 1.0 2.0
1.0 1.75 1.11 0.07
Distance 8 31 5.13 4.54 4.9 4.58 3.56 0.0 16.5
16.5 0.81 -0.10 0.81
> ##identifying outliers
> boxplot(mydataset$Price, main="Boxplot for price")
> #treating outlier for price
> mydataset<-mydataset[-26,]
> #indentifying correlation
> mydataset_cor <- as.matrix(dplyr::select_if(mydataset, is.numeric))
> corrplot(cor(mydataset_cor),order="hclust",main="corelation plot")
> #running regression model
> model1 <- lm(Price ~ ., data= mydataset)
> summary(model1)
Call:
lm(formula = Price ~ ., data = mydataset)
Residuals:
Min 1Q Median 3Q Max
-3.7059 -2.6043 -0.3876 2.2315 4.7774
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.6267827 2.9067195 6.408 1.9e-06 ***
CountySant Clara -2.6365930 2.8842949 -0.914 0.37056
Size -0.0034320 0.0025420 -1.350 0.19070
Elevation 0.5407713 0.1693998 3.192 0.00421 **
Sewer -0.0005078 0.0003100 -1.638 0.11563
Date 0.1279277 0.0356334 3.590 0.00163 **
FloodFlood-No -7.8400025 2.2885764 -3.426 0.00242 **
Distance 0.4097406 0.2453188 1.670 0.10904
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.145 on 22 degrees of freedom
Multiple R-squared: 0.8069, Adjusted R-squared: 0.7454
F-statistic: 13.13 on 7 and 22 DF, p-value: 1.493e-06
> anova(model1)
Analysis of Variance Table
Response: Price
Df Sum Sq Mean Sq F value Pr(>F)
County 1 3.432 3.432 0.3470 0.561832
Size 1 93.775 93.775 9.4796 0.005489 **
Elevation 1 300.884 300.884 30.4159 1.528e-05 ***
Sewer 1 50.352 50.352 5.0900 0.034341 *
Date 1 292.905 292.905 29.6094 1.822e-05 ***
Flood 1 140.192 140.192 14.1718 0.001069 **
Distance 1 27.596 27.596 2.7897 0.109040
Residuals 22 217.631 9.892
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> #removing correlated independent variables
> model2<-lm(Price~.-Distance -Size,data=mydataset)
> summary(model2)
Call:
lm(formula = Price ~ . - Distance - Size, data = mydataset)
Residuals:
Min 1Q Median 3Q Max
-5.0186 -2.2651 -0.3114 2.1549 5.1596
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 22.0187525 1.9634490 11.214 5.01e-11 ***
CountySant Clara -4.4613706 1.8189990 -2.453 0.02183 *
Elevation 0.5086667 0.1726287 2.947 0.00704 **
Sewer -0.0006846 0.0002789 -2.455 0.02173 *
Date 0.1308357 0.0276699 4.728 8.28e-05 ***
FloodFlood-No -7.6795702 2.1524916 -3.568 0.00156 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.252 on 24 degrees of freedom
Multiple R-squared: 0.7747, Adjusted R-squared: 0.7278
F-statistic: 16.51 on 5 and 24 DF, p-value: 4.372e-07
> ##Extract the fitted values and residual values from the model2 output
> fitted(model2)
1 2 3 4 5 6
7 8 9 10 11
7.1142451 6.1159761 -0.2899951 7.9636838 2.1263917 0.4770790
10.6952240 11.2185666 18.5981189 5.3381093 5.4689450
12 13 14 15 16 17
18 19 20 21 22
8.1526461 8.9585266 13.9074116 14.9247449 14.4160783 13.9074116
12.8900783 15.4334116 13.9074116 12.2153192 13.0355899
23 24 25 26 27 28
29 30
11.6404255 7.0245936 8.8527918 16.4336650 16.6495546 20.1411536
14.0862180 21.8966239
> residuals(model2)
1 2 3 4 5
6 7 8 9
-2.614245145 4.484023886 1.989995150 -2.963683786 2.873608286
2.822921024 -4.995224024 -5.018566643 0.801881101
10 11 12 13 14
15 16 17 18
-2.138109322 -0.768944976 -1.252646125 -0.858526620 -2.307411589
4.375255075 -2.716078257 -0.607411589 2.209921748
19 20 21 22 23
24 25 26 27
-3.033411594 1.392588411 -0.015319150 5.064410143 5.159574493
-1.124593556 -4.852791848 1.766335026 -1.549554634
28 29 30
2.758846438 1.113781968 0.003376106
> fit2 <- fitted(model2)
> res2 <- residuals(model2)
> ##Merge the fitted and residual values with Consumer dataset for
comparison sake
> dataset_reg <- cbind(mydataset, fit2, res2)
> ##Plot the actual versus fitted values in a plot
> with(dataset_reg, plot(fit2,res2, pch=19, cex=0.6))
> abline(a=0,b=0)
> mydataset2<-data.frame(Price=0,County="Sant
Clara",Size=246.8,Elevation=0,Sewer=0,Date=6,Flood="Flood-No",Distance=0)
> mydataset2$price_predection<-predict(model2,mydataset2)*1000
> mydataset2$price_predection
[1] 10662.83
> ##Multivariate normality Test
> with(mydataset, shapiro.test(Size))
Shapiro-Wilk normality test
data: Size
W = 0.41142, p-value = 6.968e-10
> with(mydataset, shapiro.test(Elevation))
Shapiro-Wilk normality test
data: Elevation
W = 0.85716, p-value = 0.0008791
> with(mydataset, shapiro.test(Sewer))
Shapiro-Wilk normality test
data: Sewer
W = 0.81081, p-value = 0.0001039
> with(mydataset, shapiro.test(Date))
Shapiro-Wilk normality test
data: Date
W = 0.90551, p-value = 0.01149
> with(mydataset, shapiro.test(Distance))
Shapiro-Wilk normality test
data: Distance
W = 0.866, p-value = 0.001366
> library(car)
> #VIF - Variation inflation factor
> vif(model2)
County Elevation Sewer Date Flood
2.179288 1.603015 1.328278 1.277447 1.825141
> #Durbin Watson Test to test Auto Correlation
> ##Null Hypothesis states that there is No auto-correlation
> #Alternate Hypothesis states there is autocorrelation
> dwtest(model2)
Durbin-Watson test
data: model2
DW = 2.112, p-value = 0.491
alternative hypothesis: true autocorrelation is greater than 0
> ## Homoscedasticity tested using Goldfelt Quant test
> #Null hypothesis : Data satisfies the condiction of homoscedasticity
> ##Alternate hypothesis states data is not Homoscedastic
> gqtest(model2)
Goldfeld-Quandt test
data: model2
GQ = 0.17811, df1 = 9, df2 = 9, p-value = 0.9915
alternative hypothesis: variance increases from segment 1 to 2