# ---- Load and clean the data ----
# NOTE(review): setwd() ties the script to a single machine; it is kept only
# because the location of the CSV is not visible from this script.
setwd("C:/")
dt <- read.csv("water_potability.csv")

# Show the raw data and, for every column, the row positions that hold NA.
dt
apply(is.na(dt), 2, which)

# Keep complete rows only.
# The original followed this with mean imputation of ph, Sulfate and
# Trihalomethanes (each followed by another na.omit()). Those statements could
# never change anything: na.omit() has already removed every row containing an
# NA, so the is.na() index was always empty. They are dropped here; the
# resulting data is the same.
# NOTE(review): if mean imputation was the actual intent, it has to replace
# this na.omit() rather than follow it - confirm with the author.
dt <- na.omit(dt)
# ---- Descriptive statistics ----
# Summarise one numeric vector as mean, median, standard deviation, minimum,
# maximum and the first/third quartiles (returned as a named numeric vector).
column_stats <- function(x) {
  c(
    mean = mean(x),
    median = median(x),
    sd = sd(x),
    min = min(x),
    max = max(x),
    Q1 = quantile(x, probs = 0.25, names = FALSE),
    Q3 = quantile(x, probs = 0.75, names = FALSE)
  )
}

# One row per column of dt, one column per statistic (same layout and column
# order as before). The statistics are no longer stored in globals called
# mean, sd, median, max and min, which masked the base functions of the same
# name, and the data frame is no longer coerced to a matrix by apply().
otput <- t(vapply(dt, column_stats, numeric(7)))

# The table was previously built but never displayed.
print(otput)
# ---- Histograms ----
# Draw one histogram per column, with bar counts printed above the bars.
# The titles are now built with paste(): in the original, most title strings
# were wrapped across two source lines, which put a line break in the middle
# of the plotted title.
hist_columns <- c(
  "ph", "Hardness", "Solids", "Chloramines", "Sulfate", "Conductivity",
  "Organic_carbon", "Trihalomethanes", "Turbidity", "Potability"
)
for (hist_column in hist_columns) {
  # Potability uses a taller count axis than the other columns, as in the
  # original calls (1500 instead of 600).
  count_limit <- if (hist_column == "Potability") 1500 else 600
  hist(
    dt[[hist_column]],
    xlab = hist_column,
    main = paste("Histogram of", hist_column),
    ylim = c(0, count_limit),
    col = "blue",
    labels = TRUE
  )
}
# ---- Potability against each predictor ----
# Scatter plot of Potability (y) against every other measured column (x).
# The titles are now built with paste(): the original title strings were
# wrapped across source lines, so the plotted titles contained stray line
# breaks and trailing spaces.
scatter_columns <- c(
  "ph", "Hardness", "Solids", "Chloramines", "Sulfate", "Conductivity",
  "Organic_carbon", "Trihalomethanes", "Turbidity"
)
for (scatter_column in scatter_columns) {
  plot(
    x = dt[[scatter_column]],
    y = dt$Potability,
    xlab = scatter_column,
    ylab = "Potability",
    main = paste("Plot of", scatter_column, "and Potability"),
    col = "blue"
  )
}
# ---- Model 1: logistic regression on all columns ----
# Split the rows 65/35 with a fixed seed so the split is reproducible.
# sample.split() comes from the caTools package, which this script never
# loaded; the call is namespace-qualified so it works without library().
set.seed(100)
split <- caTools::sample.split(dt$Potability, SplitRatio = 0.65)
mauxaydung <- subset(dt, split == TRUE)    # rows used to fit the model
maukiemdinh <- subset(dt, split == FALSE)  # held-out rows used for evaluation

# Logistic regression of Potability on every other column of the fitting rows.
mohinh <- glm(Potability ~ ., data = mauxaydung, family = binomial)
summary(mohinh)

# Predicted probabilities for the held-out rows.
dubaokiemdinh <- predict(mohinh, type = "response", newdata = maukiemdinh)
summary(dubaokiemdinh)

# Confusion table: actual Potability (rows) against predicted > 0.5 (columns).
table(maukiemdinh$Potability, dubaokiemdinh > 0.5)

# Share of held-out rows classified correctly at the 0.5 cut-off.
# This replaces the hard-coded "(2+3)/nrow(maukiemdinh)", whose counts were
# typed in by hand from one particular run of the table above.
# NOTE(review): assumes Potability is coded 0/1 and that accuracy is the
# metric the hard-coded expression was meant to report - confirm.
mean((dubaokiemdinh > 0.5) == (maukiemdinh$Potability == 1))
# ---- Correlation matrix ----
# Pairwise correlations between all columns of dt (cor() default: Pearson).
cor_matrix <- stats::cor(x = dt, method = "pearson")
print(x = cor_matrix)
# ---- Model 2: logistic regression with a ph x Hardness interaction ----
# Add the product of ph and Hardness as an extra predictor column.
dt$interaction1 <- dt$ph * dt$Hardness

# Re-create the 65/35 split with the same seed so that both halves contain the
# new column. sample.split() comes from caTools, which this script never
# loaded, hence the namespace-qualified call.
set.seed(100)
split <- caTools::sample.split(dt$Potability, SplitRatio = 0.65)
mauxaydung <- subset(dt, split == TRUE)    # rows used to fit the model
maukiemdinh <- subset(dt, split == FALSE)  # held-out rows used for evaluation

# Fixes relative to the original call:
#  * "Organic_carbon" was broken across two source lines, which is a syntax
#    error and prevented the whole script from being parsed;
#  * the model was fitted on the full data set (data = dt), so the held-out
#    rows it is evaluated on below had been used for fitting;
#  * family = binomial was missing, so a Gaussian linear model was fitted
#    instead of a logistic regression like mohinh.
mohinh2 <- glm(
  Potability ~ ph + Hardness + Solids + Chloramines + Sulfate + Conductivity +
    Organic_carbon + Trihalomethanes + Turbidity + interaction1,
  data = mauxaydung,
  family = binomial
)
summary(mohinh2)

# Predicted probabilities for the held-out rows.
dubaokiemdinh <- predict(mohinh2, type = "response", newdata = maukiemdinh)
summary(dubaokiemdinh)

# Confusion table: actual Potability (rows) against predicted > 0.5 (columns).
table(maukiemdinh$Potability, dubaokiemdinh > 0.5)

# Share of held-out rows classified correctly at the 0.5 cut-off.
# This replaces the hard-coded "(3+5)/nrow(maukiemdinh)", whose counts were
# typed in by hand from an earlier run and no longer match this model.
# NOTE(review): assumes Potability is coded 0/1 and that accuracy is the
# metric the hard-coded expression was meant to report - confirm.
mean((dubaokiemdinh > 0.5) == (maukiemdinh$Potability == 1))