Thanks for visiting codestin.com
Credit goes to www.scribd.com

0% found this document useful (0 votes)
15 views3 pages

Code R

The document outlines a data analysis process for a water potability dataset, including data cleaning, handling missing values, and calculating statistical measures like mean, median, and standard deviation. It also includes the creation of histograms and scatter plots to visualize relationships between various water quality parameters and potability. Finally, logistic regression models are built to predict water potability based on the features in the dataset.

Uploaded by

thaingan090304
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views3 pages

Code R

The document outlines a data analysis process for a water potability dataset, including data cleaning, handling missing values, and calculating statistical measures like mean, median, and standard deviation. It also includes the creation of histograms and scatter plots to visualize relationships between various water quality parameters and potability. Finally, logistic regression models are built to predict water potability based on the features in the dataset.

Uploaded by

thaingan090304
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 3

# Load the water potability data and keep only complete observations.
#
# The CSV is read through its full path instead of calling setwd(), so the
# session's working directory is left untouched.
dt <- read.csv("C:/water_potability.csv")

# Show the raw data as loaded.
dt

# For every column, list the row positions that hold a missing value.
apply(is.na(dt), 2, which)

# Drop every row that has at least one missing value.
#
# The script previously followed this with mean imputation of ph, Sulfate and
# Trihalomethanes (and further na.omit() calls). Once na.omit() has run there
# is no NA left to replace, so those statements never changed `dt` and have
# been removed.
dt <- na.omit(dt)
# Per-column descriptive statistics, collected into the matrix `otput`
# (one row per column of `dt`; columns mean, median, sd, min, max, Q1, Q3).
#
# The intermediate results are stored under `col_*` names so that the base
# functions mean(), sd(), median(), max() and min() are not masked by
# same-named vectors. vapply() iterates over the data frame's columns
# directly and guarantees one numeric value per column.
col_mean <- vapply(dt, mean, numeric(1))
col_sd <- vapply(dt, sd, numeric(1))
col_median <- vapply(dt, median, numeric(1))
Q1 <- vapply(dt, quantile, numeric(1), probs = 0.25)
Q3 <- vapply(dt, quantile, numeric(1), probs = 0.75)
col_max <- vapply(dt, max, numeric(1))
col_min <- vapply(dt, min, numeric(1))

# Column names are given explicitly so the matrix keeps the same labels the
# original cbind(mean, median, sd, min, max, Q1, Q3) produced.
otput <- cbind(
  mean = col_mean,
  median = col_median,
  sd = col_sd,
  min = col_min,
  max = col_max,
  Q1 = Q1,
  Q3 = Q3
)
# Histograms of every variable in `dt`.
#
# The nine water-quality measurements share the same settings, so they are
# drawn in a loop. Titles are built with paste() on a single line; the
# previous string literals were split across source lines, which put a line
# break inside each title.
hist_features <- c(
  "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
  "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity"
)
for (feature in hist_features) {
  hist(
    dt[[feature]],
    xlab = feature,
    main = paste("Histogram of", feature),
    ylim = c(0, 600),
    col = "blue",
    labels = TRUE
  )
}

# Potability gets its own call because it uses a taller y-axis (0 to 1500).
hist(
  dt$Potability,
  xlab = "Potability",
  main = "Histogram of Potability",
  ylim = c(0, 1500),
  col = "blue",
  labels = TRUE
)
# Scatter plots of Potability (y-axis) against each water-quality
# measurement (x-axis).
#
# All nine plots use identical settings, so they are drawn in a loop. The
# title is assembled on one line; the previous string literals were split
# across source lines, which put a line break inside each title. The trailing
# space of the original titles is kept.
scatter_features <- c(
  "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
  "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity"
)
for (feature in scatter_features) {
  plot(
    dt[[feature]],
    dt$Potability,
    xlab = feature,
    ylab = "Potability",
    main = paste0("Plot of ", feature, " and Potability "),
    col = "blue"
  )
}
# First model: logistic regression of Potability on all other columns,
# fitted on a training sample and evaluated on a held-out validation sample.

# Fixed seed so the random split is reproducible.
set.seed(100)

# sample.split() comes from the caTools package. No library() call is visible
# in this script, so the call is namespace-qualified to work without one.
# TRUE marks rows for the training sample (SplitRatio = 0.65).
in_train <- caTools::sample.split(dt$Potability, SplitRatio = 0.65)
mauxaydung <- dt[in_train, ]    # training sample
maukiemdinh <- dt[!in_train, ]  # validation sample

mohinh <- glm(Potability ~ ., data = mauxaydung, family = binomial)
summary(mohinh)

# Predicted probabilities for the validation sample.
dubaokiemdinh <- predict(mohinh, type = "response", newdata = maukiemdinh)
summary(dubaokiemdinh)

# Confusion table: actual Potability (rows) vs. predicted probability > 0.5.
table(maukiemdinh$Potability, dubaokiemdinh > 0.5)

# Validation accuracy: share of rows whose thresholded prediction matches the
# actual label. Computed from the data rather than from counts typed in by
# hand, so it stays correct whenever the data or the split changes.
mean((dubaokiemdinh > 0.5) == (maukiemdinh$Potability == 1))

# Pairwise correlations between all columns of `dt`, computed before the
# interaction column below is added.
cor_matrix <- cor(dt)
print(cor_matrix)

# Extra predictor: element-wise product of the ph and Hardness columns.
dt[["interaction1"]] <- with(dt, ph * Hardness)


# Second model: logistic regression with the explicit predictor list plus the
# ph x Hardness interaction column, evaluated on a held-out validation sample.

# Same seed as the first model so both use a reproducible split.
set.seed(100)

# sample.split() comes from the caTools package. No library() call is visible
# in this script, so the call is namespace-qualified to work without one.
# TRUE marks rows for the training sample (SplitRatio = 0.65).
in_train <- caTools::sample.split(dt$Potability, SplitRatio = 0.65)
mauxaydung <- dt[in_train, ]    # training sample
maukiemdinh <- dt[!in_train, ]  # validation sample

# Fit on the training sample only. The model was previously fitted on all of
# `dt`, so the validation rows were also used for training. family = binomial
# makes this a logistic regression; without it glm() fits a Gaussian model.
# `Organic_carbon` is written on one line here; it was split in two by a line
# wrap, which does not parse.
mohinh2 <- glm(
  Potability ~ ph + Hardness + Solids + Chloramines + Sulfate + Conductivity +
    Organic_carbon + Trihalomethanes + Turbidity + interaction1,
  data = mauxaydung,
  family = binomial
)
summary(mohinh2)

# Predicted probabilities for the validation sample.
dubaokiemdinh <- predict(mohinh2, type = "response", newdata = maukiemdinh)
summary(dubaokiemdinh)

# Confusion table: actual Potability (rows) vs. predicted probability > 0.5.
table(maukiemdinh$Potability, dubaokiemdinh > 0.5)

# Validation accuracy: share of rows whose thresholded prediction matches the
# actual label. Computed from the data rather than from counts typed in by
# hand, so it stays correct whenever the data or the split changes.
mean((dubaokiemdinh > 0.5) == (maukiemdinh$Potability == 1))

You might also like