0% found this document useful (0 votes)

31 views10 pages

As 2

Uploaded by

Aadesh Srivastav

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

31 views10 pages

As 2

Uploaded by

Aadesh Srivastav

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as DOCX, PDF, TXT or read online on Scribd

You are on page 1/ 10

Input:

# Leslie Salt land-price analysis ----
# Reads the Leslie Salt data set, fits two linear models for Price, predicts
# the price of the target parcel, and runs the usual regression diagnostics
# (normality, VIF, Durbin-Watson, Goldfeld-Quandt).

library(readxl)
library(psych)
library(lmtest)
library(zoo)
library(car)

# Load the data ----
# The data directory is kept in a variable instead of calling setwd(), so the
# script does not change the working directory of the R session.
data_dir <- "C:\\Users\\Baba\\Desktop\\Advance Stat"
mydataset <- read_xlsx(file.path(data_dir, "Dataset_LeslieSalt.xlsx"))

summary(mydataset)
str(mydataset)

# Convert County and Flood into factors ----
# attach() was dropped: every later reference goes through mydataset$... or
# with(), so the attached copy was never used (and would have gone stale once
# the columns below are recoded).
mydataset$County <- factor(
  mydataset$County,
  levels = c(0, 1),
  labels = c("San Meteo", "Sant Clara")
)

# Code 1 is labelled "Flood-Yes". The previous labels were inverted (1 was
# called "Flood-No"), so the "no flood" prediction further down silently
# applied the flood penalty. In the recorded run code 1 is the rare case
# (mean 0.16) and carries a large negative coefficient, i.e. it behaves like
# the flood-prone case.
# NOTE(review): confirm 1 = subject to flooding against the data dictionary.
mydataset$Flood <- factor(
  mydataset$Flood,
  levels = c(0, 1),
  labels = c("Flood-No", "Flood-Yes")
)

# Descriptive statistics (psych) ----
describe(mydataset)

# Identify outliers ----
boxplot(mydataset$Price, main = "Boxplot for price")

# Treat the Price outlier by dropping that observation.
# NOTE(review): row 26 is hard-coded; presumably it is the point flagged by
# the boxplot above -- confirm before reusing this script on other data.
mydataset <- mydataset[-26, ]

# Identify correlation ----
# Keep only the numeric columns (the two factors are excluded).
numeric_cols <- vapply(mydataset, is.numeric, logical(1))
mydataset_cor <- as.matrix(mydataset[numeric_cols])

# corrplot is called through its namespace because the package is never
# attached by this script.
corrplot::corrplot(
  cor(mydataset_cor),
  order = "hclust",
  main = "corelation plot"
)

# Observation: high correlation between Price and Date, followed by
# Elevation, and low correlation with Size and Distance.
# Considering multicollinearity between the independent variables: Size and
# Distance are correlated with each other.

# Regression models ----
# model1: all predictors.
model1 <- lm(Price ~ ., data = mydataset)
summary(model1)
anova(model1)

# model2: drop the correlated independent variables Distance and Size.
model2 <- lm(Price ~ . - Distance - Size, data = mydataset)
summary(model2)

# In the recorded run every model2 coefficient is significant at the 0.05
# level and adjusted R-squared drops only slightly versus model1
# (0.7278 vs 0.7454), so model2 is accepted.

# Fitted values and residuals of model2 ----
fit2 <- fitted(model2)
res2 <- residuals(model2)
fit2
res2

# Merge the fitted and residual values with the data set for comparison.
dataset_reg <- cbind(mydataset, fit2, res2)

# Residuals versus fitted values, with a horizontal reference line at zero.
with(dataset_reg, plot(fit2, res2, pch = 19, cex = 0.6))
abline(a = 0, b = 0)

# Prediction for the problem statement ----
# County    = Sant Clara
# Elevation = 0 (no elevation)
# Sewer     = 0 (not clear from the problem statement, hence kept at 0)
# Date      = 6 (the problem statement also mentions 2 and 4 months)
# Flood     = no flood
# Distance  = 0
# Price, Size and Distance are not used by model2 but are kept so that the
# new data has every column named in the model formula.
mydataset2 <- data.frame(
  Price = 0,
  County = "Sant Clara",
  Size = 246.8,
  Elevation = 0,
  Sewer = 0,
  Date = 6,
  Flood = "Flood-No",
  Distance = 0
)

# Scale the prediction by 1000.
# NOTE(review): presumably Price is recorded in thousands of dollars -- confirm.
mydataset2$price_predection <- predict(model2, mydataset2) * 1000
mydataset2$price_predection

# Normality tests ----
# Shapiro-Wilk on each numeric predictor.
with(mydataset, shapiro.test(Size))
with(mydataset, shapiro.test(Elevation))
with(mydataset, shapiro.test(Sewer))
with(mydataset, shapiro.test(Date))
with(mydataset, shapiro.test(Distance))

# The linear-model normality assumption concerns the residuals, so test them
# as well.
shapiro.test(res2)

# VIF - variance inflation factor ----
vif(model2)

# Durbin-Watson test for autocorrelation ----
# Null hypothesis: there is no autocorrelation.
# Alternative hypothesis: there is autocorrelation.
dwtest(model2)

# Homoscedasticity tested with the Goldfeld-Quandt test ----
# Null hypothesis: the data satisfy the condition of homoscedasticity.
# Alternative hypothesis: the data are not homoscedastic.
gqtest(model2)
output:
> setwd("C:\\Users\\Baba\\Desktop\\Advance Stat")
> library(readxl)
> mydataset<-read_xlsx("Dataset_LeslieSalt.xlsx")
> summary(mydataset)
Price County Size Elevation
Sewer Date Flood
Min. : 1.70 Min. :0.0000 Min. : 6.90 Min. : 0.000 Min.
: 0 Min. :-103.00 Min. :0.0000
1st Qu.: 5.35 1st Qu.:0.0000 1st Qu.: 20.35 1st Qu.: 2.000 1st
Qu.: 0 1st Qu.: -63.50 1st Qu.:0.0000
Median :11.70 Median :1.0000 Median : 51.40 Median : 4.000
Median : 900 Median : -59.00 Median :0.0000
Mean :11.95 Mean :0.6129 Mean : 139.97 Mean : 4.645 Mean
: 1981 Mean : -58.65 Mean :0.1613
3rd Qu.:16.05 3rd Qu.:1.0000 3rd Qu.: 104.10 3rd Qu.: 7.000 3rd
Qu.: 3450 3rd Qu.: -51.00 3rd Qu.:0.0000
Max. :37.20 Max. :1.0000 Max. :1695.20 Max. :20.000 Max.
:10000 Max. : -4.00 Max. :1.0000
Distance
Min. : 0.000
1st Qu.: 0.850
Median : 4.900
Mean : 5.132
3rd Qu.: 5.500
Max. :16.500
> str(mydataset)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31 obs. of 8 variables:
$ Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
$ County : num 1 1 0 0 0 1 1 1 1 1 ...
$ Size : num 138.4 52 16.1 1695.2 845 ...
$ Elevation: num 10 4 0 1 1 2 4 4 20 0 ...
$ Sewer : num 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
$ Date : num -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
$ Flood : num 0 0 1 0 1 0 0 0 0 0 ...
$ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
> attach(mydataset)
> #Converting country Aand flood into factors
> mydataset$County<-factor(mydataset$County, levels=c("0","1"), labels =
c("San Meteo", "Sant Clara"))
> mydataset$Flood<-factor(mydataset$Flood,levels = c("0","1"),labels =
c("Flood-Yes","Flood-No"))
> ##Load the psych and lmtest Libraries
> library(psych)
> library(lmtest)
> library(zoo)
> describe(mydataset)
vars n mean sd median trimmed mad min max
range skew kurtosis se
Price 1 31 11.95 7.71 11.7 11.21 8.90 1.7 37.2
35.5 1.03 1.39 1.39
County* 2 31 1.61 0.50 2.0 1.64 0.00 1.0 2.0
1.0 -0.44 -1.86 0.09
Size 3 31 139.97 327.17 51.4 57.98 52.34 6.9 1695.2
1688.3 3.77 14.18 58.76
Elevation 4 31 4.65 4.36 4.0 4.12 2.97 0.0 20.0
20.0 1.43 2.61 0.78
Sewer 5 31 1981.29 2481.31 900.0 1576.80 1334.34 0.0 10000.0
10000.0 1.29 1.30 445.66
Date 6 31 -58.65 24.53 -59.0 -59.56 8.90 -103.0 -4.0
99.0 0.20 0.08 4.41
Flood* 7 31 1.16 0.37 1.0 1.08 0.00 1.0 2.0
1.0 1.75 1.11 0.07
Distance 8 31 5.13 4.54 4.9 4.58 3.56 0.0 16.5
16.5 0.81 -0.10 0.81
> ##identifying outliers
> boxplot(mydataset$Price, main="Boxplot for price")
> #treating outlier for price
> mydataset<-mydataset[-26,]
> #indentifying correlation
> mydataset_cor <- as.matrix(dplyr::select_if(mydataset, is.numeric))
> corrplot(cor(mydataset_cor),order="hclust",main="corelation plot")
> #running regression model
> model1 <- lm(Price ~ ., data= mydataset)
> summary(model1)

Call:
lm(formula = Price ~ ., data = mydataset)

Residuals:
Min 1Q Median 3Q Max
-3.7059 -2.6043 -0.3876 2.2315 4.7774

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.6267827 2.9067195 6.408 1.9e-06 ***
CountySant Clara -2.6365930 2.8842949 -0.914 0.37056
Size -0.0034320 0.0025420 -1.350 0.19070
Elevation 0.5407713 0.1693998 3.192 0.00421 **
Sewer -0.0005078 0.0003100 -1.638 0.11563
Date 0.1279277 0.0356334 3.590 0.00163 **
FloodFlood-No -7.8400025 2.2885764 -3.426 0.00242 **
Distance 0.4097406 0.2453188 1.670 0.10904
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.145 on 22 degrees of freedom

Multiple R-squared: 0.8069, Adjusted R-squared: 0.7454
F-statistic: 13.13 on 7 and 22 DF, p-value: 1.493e-06

> anova(model1)
Analysis of Variance Table

Response: Price
Df Sum Sq Mean Sq F value Pr(>F)
County 1 3.432 3.432 0.3470 0.561832
Size 1 93.775 93.775 9.4796 0.005489 **
Elevation 1 300.884 300.884 30.4159 1.528e-05 ***
Sewer 1 50.352 50.352 5.0900 0.034341 *
Date 1 292.905 292.905 29.6094 1.822e-05 ***
Flood 1 140.192 140.192 14.1718 0.001069 **
Distance 1 27.596 27.596 2.7897 0.109040
Residuals 22 217.631 9.892
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> #removing correlated independent variables
> model2<-lm(Price~.-Distance -Size,data=mydataset)
> summary(model2)

Call:
lm(formula = Price ~ . - Distance - Size, data = mydataset)

Residuals:
Min 1Q Median 3Q Max
-5.0186 -2.2651 -0.3114 2.1549 5.1596

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 22.0187525 1.9634490 11.214 5.01e-11 ***
CountySant Clara -4.4613706 1.8189990 -2.453 0.02183 *
Elevation 0.5086667 0.1726287 2.947 0.00704 **
Sewer -0.0006846 0.0002789 -2.455 0.02173 *
Date 0.1308357 0.0276699 4.728 8.28e-05 ***
FloodFlood-No -7.6795702 2.1524916 -3.568 0.00156 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.252 on 24 degrees of freedom

Multiple R-squared: 0.7747, Adjusted R-squared: 0.7278
F-statistic: 16.51 on 5 and 24 DF, p-value: 4.372e-07

> ##Extract the fitted values and residual values from the model2 output
> fitted(model2)
1 2 3 4 5 6
7 8 9 10 11
7.1142451 6.1159761 -0.2899951 7.9636838 2.1263917 0.4770790
10.6952240 11.2185666 18.5981189 5.3381093 5.4689450
12 13 14 15 16 17
18 19 20 21 22
8.1526461 8.9585266 13.9074116 14.9247449 14.4160783 13.9074116
12.8900783 15.4334116 13.9074116 12.2153192 13.0355899
23 24 25 26 27 28
29 30
11.6404255 7.0245936 8.8527918 16.4336650 16.6495546 20.1411536
14.0862180 21.8966239
> residuals(model2)
1 2 3 4 5
6 7 8 9
-2.614245145 4.484023886 1.989995150 -2.963683786 2.873608286
2.822921024 -4.995224024 -5.018566643 0.801881101
10 11 12 13 14
15 16 17 18
-2.138109322 -0.768944976 -1.252646125 -0.858526620 -2.307411589
4.375255075 -2.716078257 -0.607411589 2.209921748
19 20 21 22 23
24 25 26 27
-3.033411594 1.392588411 -0.015319150 5.064410143 5.159574493
-1.124593556 -4.852791848 1.766335026 -1.549554634
28 29 30
2.758846438 1.113781968 0.003376106
> fit2 <- fitted(model2)
> res2 <- residuals(model2)
> ##Merge the fitted and residual values with Consumer dataset for
comparison sake
> dataset_reg <- cbind(mydataset, fit2, res2)
> ##Plot the actual versus fitted values in a plot
> with(dataset_reg, plot(fit2,res2, pch=19, cex=0.6))
> abline(a=0,b=0)
> mydataset2<-data.frame(Price=0,County="Sant
Clara",Size=246.8,Elevation=0,Sewer=0,Date=6,Flood="Flood-No",Distance=0)
> mydataset2$price_predection<-predict(model2,mydataset2)*1000
> mydataset2$price_predection
[1] 10662.83
> ##Multivariate normality Test
> with(mydataset, shapiro.test(Size))

Shapiro-Wilk normality test

data: Size
W = 0.41142, p-value = 6.968e-10

> with(mydataset, shapiro.test(Elevation))

Shapiro-Wilk normality test

data: Elevation
W = 0.85716, p-value = 0.0008791

> with(mydataset, shapiro.test(Sewer))

Shapiro-Wilk normality test

data: Sewer
W = 0.81081, p-value = 0.0001039
> with(mydataset, shapiro.test(Date))

Shapiro-Wilk normality test

data: Date
W = 0.90551, p-value = 0.01149

> with(mydataset, shapiro.test(Distance))

Shapiro-Wilk normality test

data: Distance
W = 0.866, p-value = 0.001366

> library(car)
> #VIF - Variation inflation factor
> vif(model2)
County Elevation Sewer Date Flood
2.179288 1.603015 1.328278 1.277447 1.825141
> #Durbin Watson Test to test Auto Correlation
> ##Null Hypothesis states that there is No auto-correlation
> #Alternate Hypothesis states there is autocorrelation
> dwtest(model2)

Durbin-Watson test

data: model2
DW = 2.112, p-value = 0.491
alternative hypothesis: true autocorrelation is greater than 0

> ## Homoscedasticity tested using Goldfelt Quant test

> #Null hypothesis : Data satisfies the condiction of homoscedasticity
> ##Alternate hypothesis states data is not Homoscedastic
> gqtest(model2)

Goldfeld-Quandt test

data: model2
GQ = 0.17811, df1 = 9, df2 = 9, p-value = 0.9915
alternative hypothesis: variance increases from segment 1 to 2

Leslie Salt Property Project Report
No ratings yet
Leslie Salt Property Project Report
10 pages
Lesllie Salt Company
No ratings yet
Lesllie Salt Company
15 pages
Fds QB
No ratings yet
Fds QB
6 pages
HW 3
No ratings yet
HW 3
20 pages
soruma-SECOND-ASSEsiment L Reg
No ratings yet
soruma-SECOND-ASSEsiment L Reg
33 pages
Gdpforecast.r: Rehanshu Vij 2020-12-10
No ratings yet
Gdpforecast.r: Rehanshu Vij 2020-12-10
10 pages
Soruma SECOND ASSEsiment Final L Reg
No ratings yet
Soruma SECOND ASSEsiment Final L Reg
34 pages
Spatial Statistics in R
No ratings yet
Spatial Statistics in R
29 pages
HHH
No ratings yet
HHH
30 pages
Spatial Statistics in R
No ratings yet
Spatial Statistics in R
29 pages
GianluigiDeRubertis 228766
No ratings yet
GianluigiDeRubertis 228766
9 pages
L21 ECO220 Print
No ratings yet
L21 ECO220 Print
16 pages
Analysis Document
No ratings yet
Analysis Document
7 pages
R Functions
No ratings yet
R Functions
8 pages
Simple Linear Regression in R
No ratings yet
Simple Linear Regression in R
17 pages
Lab Book
No ratings yet
Lab Book
24 pages
Swd325 Practical Solution
No ratings yet
Swd325 Practical Solution
9 pages
Courseproject 2 RMD
No ratings yet
Courseproject 2 RMD
7 pages
Granger Causality and VAR Models
No ratings yet
Granger Causality and VAR Models
1 page
R Programming Notes
No ratings yet
R Programming Notes
35 pages
Yaikob Second Assesiment Final
No ratings yet
Yaikob Second Assesiment Final
33 pages
R Programs 2024-2025
No ratings yet
R Programs 2024-2025
13 pages
Lab Wk1soln PDF
No ratings yet
Lab Wk1soln PDF
14 pages
Part A R Programming
No ratings yet
Part A R Programming
10 pages
Weather Impact on Health & Economy
No ratings yet
Weather Impact on Health & Economy
5 pages
BZAN 535: Linear Regression
No ratings yet
BZAN 535: Linear Regression
11 pages
Problem Set 3: General Guideline
No ratings yet
Problem Set 3: General Guideline
12 pages
Langkah-Langkah Pemodelan Trend Analisis Model Regresi Regression Analysis: Data Versus T
No ratings yet
Langkah-Langkah Pemodelan Trend Analisis Model Regresi Regression Analysis: Data Versus T
14 pages
Bai11 1 1
No ratings yet
Bai11 1 1
11 pages
Analysis Course HW1
No ratings yet
Analysis Course HW1
5 pages
R 5 Marks
No ratings yet
R 5 Marks
11 pages
Dissertation Model
No ratings yet
Dissertation Model
23 pages
R Cheat Sheets for ECON1267
No ratings yet
R Cheat Sheets for ECON1267
13 pages
418 Material
No ratings yet
418 Material
16 pages
Heathrow Sunshine Time Series Analysis
No ratings yet
Heathrow Sunshine Time Series Analysis
19 pages
Matrix, Dataframes, List
No ratings yet
Matrix, Dataframes, List
8 pages
Analysis Using Statistical: Introduction & Data Exploration
No ratings yet
Analysis Using Statistical: Introduction & Data Exploration
23 pages
Ohyeah
No ratings yet
Ohyeah
4 pages
Chapter 16. Simultaneous Equations Models
No ratings yet
Chapter 16. Simultaneous Equations Models
23 pages
DA Lab
No ratings yet
DA Lab
27 pages
Data Visualization 19bce0761 - Parth Sharma Slot: F2 Theory Da
No ratings yet
Data Visualization 19bce0761 - Parth Sharma Slot: F2 Theory Da
33 pages
Ordinary Kriging in R
No ratings yet
Ordinary Kriging in R
2 pages
07exercise Solution
No ratings yet
07exercise Solution
9 pages
Old Faithful Geyser Data Analysis
No ratings yet
Old Faithful Geyser Data Analysis
12 pages
R Programming Codes Linear Regression
No ratings yet
R Programming Codes Linear Regression
20 pages
LP Prcatical 2 Jupyter Notebook
No ratings yet
LP Prcatical 2 Jupyter Notebook
5 pages
Data Science Practicals
No ratings yet
Data Science Practicals
47 pages
SML Practical 1to11
No ratings yet
SML Practical 1to11
23 pages
A Short List of Some Useful R Commands: Input and Display
No ratings yet
A Short List of Some Useful R Commands: Input and Display
2 pages
Bacs HW1
No ratings yet
Bacs HW1
6 pages
A Short List of The Most Useful R Commands
No ratings yet
A Short List of The Most Useful R Commands
8 pages
Regression 2
No ratings yet
Regression 2
52 pages
Essential R Commands Guide
No ratings yet
Essential R Commands Guide
11 pages
R Program
No ratings yet
R Program
22 pages
Exame Do Dia 13 12 2019
No ratings yet
Exame Do Dia 13 12 2019
8 pages
Cycle Time Reduction of Queue Hour Calculation - Final
100% (1)
Cycle Time Reduction of Queue Hour Calculation - Final
37 pages
DVT Dashboard-1
No ratings yet
DVT Dashboard-1
1 page
Project Notes 1: Purpose of Document
100% (1)
Project Notes 1: Purpose of Document
25 pages
As 2
No ratings yet
As 2
10 pages
Using A Windows 7 Installation Disk: Backup Your Files
No ratings yet
Using A Windows 7 Installation Disk: Backup Your Files
15 pages

As 2

Uploaded by

As 2

Uploaded by

Input:

#Converting country and flood into factors

mydataset$County<-factor(mydataset$County, levels=c("0","1"), labels = c("San Meteo", "Sant

mydataset$Flood<-factor(mydataset$Flood,levels = c("0","1"),labels = c("Flood-Yes","Flood-No"))

##Load the psych and lmtest Libraries

boxplot(mydataset$Price, main="Boxplot for price")

#treating outlier for price

mydataset_cor <- as.matrix(dplyr::select_if(mydataset, is.numeric))

#running regression model

model1 <- lm(Price ~ ., data= mydataset)

fit2 <- fitted(model2)

res2 <- residuals(model2)

dataset_reg <- cbind(mydataset, fit2, res2)

##Plot the actual versus fitted values in a plot

with(dataset_reg, plot(fit2,res2, pch=19, cex=0.6))

#prediction for problem statement

#sewer= no clear from problem statement hence keeping it 0

#Date=2 months,4 months, 6months

##Multivariate normality Test

#VIF - Variation inflation factor

#Durbin Watson Test to test Auto Correlation

##Null Hypothesis states that there is No auto-correlation

#Alternate Hypothesis states there is autocorrelation

## Homoscedasticity tested using Goldfelt Quant test

#Null hypothesis : Data satisfies the condiction of homoscedasticity

##Alternate hypothesis states data is not Homoscedastic

Residual standard error: 3.145 on 22 degrees of freedom

Residual standard error: 3.252 on 24 degrees of freedom

Shapiro-Wilk normality test

> with(mydataset, shapiro.test(Elevation))

Shapiro-Wilk normality test

> with(mydataset, shapiro.test(Sewer))

Shapiro-Wilk normality test

Shapiro-Wilk normality test

> with(mydataset, shapiro.test(Distance))

Shapiro-Wilk normality test

> ## Homoscedasticity tested using Goldfelt Quant test

You might also like