# --------------------------------------------------------------------
# Developed by Prof. Carlito O. Daarol
# Math Department
# Mindanao State University
# General Santos City
# September 12, 2023
#
# List of functions intended as support for Exploratory Data Analysis
# compute correlation coefficient
# plot scatterplot and graphics
# Check for normality
# --------------------------------------------------------------------
# 1. correlation - function to compute Pearson r correlation using the
#    summation formula, using two vectors or arrays X and Y
# 2. Correcorre - function to compute correlation using a dataframe of any
#    size as input
# 3. pairwiseCor - function to compute pairwise correlations from two sets of
#    variables
# 4. singlesetCor - function to compute pairwise correlation from 1 set of
#    variables
# 5. CorrsjPlot - function to compute pairwise correlation using a dataframe
#    and the sjPlot package
# 6. DrawCorrelations - function to draw correlation image or heat map using
#    an R package
# 7. readcsv - function to read csv file data from any location
# 8. CorrePlotXY - function to perform a correlation plot for two variables
# 9. CheckforNormality - function to check for normality of any numeric
#    variable using the Shapiro-Wilk test
# 10. QQNormality_Plot - function to check and plot normality of distribution
# 11. QQPlot - function to check and plot normality using an R package
# 12. PlotHistDensity - function to plot histogram and density using a
#     dataframe as input
#------------------------------------------------------------
# Function to compute Pearson Correlation Using the formula
# using two vectors as input data
# -----------------------------------------------------------
correlation <- function(datX, datY) {
  # Pearson correlation coefficient of two vectors, computed with the
  # raw-score (summation) formula:
  #   r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2))
  #
  # datX, datY: numeric vectors of equal length.
  # Returns a single numeric value. The result is NaN when the denominator
  # is zero (e.g. a constant input) and NA when an input contains NA.
  if (length(datX) != length(datY)) {
    # Without this check R would silently recycle the shorter vector.
    stop("datX and datY must have the same length", call. = FALSE)
  }

  # Work in double precision: integer inputs can overflow to NA in the
  # products and sums below.
  x <- as.numeric(datX)
  y <- as.numeric(datY)
  n <- length(x)

  sum_x <- sum(x)
  sum_y <- sum(y)
  sum_xy <- sum(x * y)
  sum_xx <- sum(x * x)
  sum_yy <- sum(y * y)

  numerator <- n * sum_xy - sum_x * sum_y
  denominator <- (n * sum_xx - sum_x * sum_x) * (n * sum_yy - sum_y * sum_y)
  numerator / sqrt(denominator)
}
# -----------------------------------------------------------
# Function to compute Pearson Correlation Using a dataframe
# the dataframe may consist of several columns
# -----------------------------------------------------------
Correcorre <- function(data) {
  # Pearson correlation for every pair of columns in `data`.
  #
  # data: a data frame (or matrix) of numeric columns.
  # Returns a numeric vector with one coefficient per column pair, ordered
  # (1,2), (1,3), ..., (1,p), (2,3), ..., (p-1,p). Each coefficient is
  # computed by correlation(). Returns an empty numeric vector when `data`
  # has fewer than two columns.
  n_cols <- ncol(data)
  if (is.null(n_cols)) {
    stop("data must be a data frame or matrix", call. = FALSE)
  }
  if (n_cols < 2) {
    return(numeric(0))
  }

  # Preallocate: there are choose(n_cols, 2) distinct pairs.
  result <- numeric(n_cols * (n_cols - 1) / 2)
  counter <- 0L
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      counter <- counter + 1L
      result[counter] <- correlation(data[, i], data[, j])
    }
  }
  result
}
# --------------------------------------------------------------------------
# Function to compute Pearson Correlation using two sets of variables
# -------------------------------------------------------------------------
pairwiseCor <- function(depvar, indepvar, corrtype) {
  # Correlate every column of `depvar` with every column of `indepvar`.
  #
  # depvar, indepvar: data frames of numeric columns (same number of rows).
  # corrtype: correlation method handed to cor.test() as `method`
  #   (e.g. "pearson", "kendall" or "spearman").
  #
  # Returns a data frame with one row per (depvar, indepvar) column pair and
  # the columns VariableX, VariableY, AbsCor, Cor, PValue and
  # "Assessment (Pvalue <0.05)". Coefficients and p-values are rounded to
  # two decimals for display.
  n_dep <- ncol(depvar)
  n_ind <- ncol(indepvar)
  n_pairs <- n_dep * n_ind

  # Preallocated holders for the results, one slot per pair.
  var_x <- character(n_pairs)
  var_y <- character(n_pairs)
  estimate <- numeric(n_pairs)
  p_raw <- numeric(n_pairs)

  k <- 0L
  for (i in seq_len(n_dep)) {
    for (j in seq_len(n_ind)) {
      k <- k + 1L
      var_x[k] <- names(depvar)[i]
      var_y[k] <- names(indepvar)[j]
      # Take the coefficient from the same test object as the p-value so
      # that both follow `corrtype`.
      tt <- cor.test(depvar[[i]], indepvar[[j]], method = corrtype)
      estimate[k] <- unname(tt$estimate)
      p_raw[k] <- tt$p.value
    }
  }

  msg1 <- "Reject Ho. Linear Relationship is true"
  msg2 <- "Not correlated variables"
  # Decide significance on the unrounded p-value; rounding first would turn
  # e.g. 0.046 into 0.05 and misreport it as not significant.
  assessment <- as.character(ifelse(p_raw < 0.05, msg1, msg2))

  df <- data.frame(
    var_x,
    var_y,
    round(abs(estimate), 2),
    round(estimate, 2),
    round(p_raw, 2),
    assessment,
    stringsAsFactors = FALSE
  )
  colnames(df) <- c(
    "VariableX", "VariableY", "AbsCor", "Cor", "PValue",
    "Assessment (Pvalue <0.05)"
  )
  df
}
# -----------------------------------------------------------------------
# Function to compute pairwise correlations using a single set of variables
# -----------------------------------------------------------------------
# function to compute pairwise correlation from a single set
singlesetCor <- function(singleset, corrtype) {
  # Correlate every distinct pair of columns within one data frame.
  #
  # singleset: data frame of numeric columns.
  # corrtype: correlation method handed to cor.test() as `method`
  #   (e.g. "pearson", "kendall" or "spearman").
  #
  # Returns a data frame with one row per column pair (i < j) and the
  # columns VarX, VarY, AbsCor, Cor, PValue and Assessment. Coefficients and
  # p-values are rounded to two decimals. With fewer than two columns the
  # result has zero rows.
  numcol <- ncol(singleset)
  n_pairs <- if (numcol >= 2) numcol * (numcol - 1) / 2 else 0

  # Preallocated holders for the results, one slot per pair.
  var_x <- character(n_pairs)
  var_y <- character(n_pairs)
  estimate <- numeric(n_pairs)
  p_raw <- numeric(n_pairs)

  k <- 0L
  for (i in seq_len(max(numcol - 1, 0))) {
    for (j in (i + 1):numcol) {
      k <- k + 1L
      var_x[k] <- names(singleset)[i]
      var_y[k] <- names(singleset)[j]
      # Take the coefficient from the same test object as the p-value so
      # that both follow `corrtype`.
      tt <- cor.test(singleset[[i]], singleset[[j]], method = corrtype)
      estimate[k] <- unname(tt$estimate)
      p_raw[k] <- tt$p.value
    }
  }

  # Decide significance on the unrounded p-value; rounding first would turn
  # e.g. 0.046 into 0.05 and misreport it as not significant.
  # NOTE(review): the second label lacks a closing parenthesis; it is kept
  # unchanged in case callers match on the exact text.
  assessment <- as.character(
    ifelse(p_raw < 0.05, "<0.05 (sig)", "> 0.05 (not sig")
  )

  data.frame(
    VarX = var_x,
    VarY = var_y,
    AbsCor = round(abs(estimate), 2),
    Cor = round(estimate, 2),
    PValue = round(p_raw, 2),
    Assessment = assessment,
    stringsAsFactors = FALSE
  )
}
# -----------------------------------------------------------
# function to compute pairwise correlation using R packages
# -----------------------------------------------------------
CorrsjPlot <- function(data, corrtype, mtitle) {
  # Build a correlation table for `data` by calling tab_corr() after
  # attaching the sjPlot package.
  #
  # data: the data passed to tab_corr() as its first argument.
  # corrtype: forwarded as `corr.method`.
  # mtitle: forwarded as `title`.
  #
  # The remaining tab_corr() options are fixed here: pairwise NA deletion,
  # p-values shown but not as numbers, non-significant values faded,
  # 3 digits, lower triangle only, output sent to the viewer.
  # Returns whatever tab_corr() returns.
  library(sjPlot)
  tab_corr(
    data,
    title = mtitle,
    corr.method = corrtype,
    na.deletion = "pairwise",
    triangle = "lower",
    digits = 3,
    show.p = TRUE,
    p.numeric = FALSE,
    fade.ns = TRUE,
    wrap.labels = 40,
    remove.spaces = TRUE,
    use.viewer = TRUE,
    var.labels = NULL,
    val.rm = NULL,
    string.diag = NULL,
    CSS = NULL,
    encoding = NULL,
    file = NULL
  )
}
# ----------------------------------------------------
# function to draw Correlations using R packages
# ---------------------------------------------------
DrawCorrelations <- function(cordata) {
  # Draw one simulated scatterplot per correlation value in `cordata`.
  #
  # cordata: vector of target correlation coefficients.
  #
  # For each value, 100 points are drawn with rmvnorm() using a 2x2
  # correlation matrix built from that value, then mapped to normal
  # variables with mean 1 and sd 0.02. Each panel shows the points, the
  # sample Pearson r in the title and the fitted regression line in red.
  # Panels are laid out on a 2 x 3 grid.
  # Called for its plotting side effect; returns NULL invisibly.

  # library() stops with an error if mvtnorm is missing, whereas require()
  # would only return FALSE and fail later at rmvnorm().
  library(mvtnorm)

  # Restore the caller's graphics settings (mfrow, mar, oma) on exit, even
  # if plotting fails part-way.
  old_par <- par(
    mfrow = c(2, 3), mar = 0.1 + c(4, 4, 1, 1), oma = c(0, 0, 2, 0)
  )
  on.exit(par(old_par), add = TRUE)

  for (i in seq_along(cordata)) {
    S <- matrix(c(1, cordata[i], cordata[i], 1), 2, 2)
    AB <- rmvnorm(n = 100, mean = c(0, 0), sigma = S)
    U <- pnorm(AB) # each column of U is now uniform on (0, 1)
    x <- qnorm(U[, 1], 1, .02) # normal with mean 1 and sd 0.02
    y <- qnorm(U[, 2], 1, .02) # normal with mean 1 and sd 0.02
    corval <- cor(x, y)
    plot(x, y, main = paste0(" Pearson r = ", round(corval, 3)))
    fit <- lm(y ~ x)
    abline(fit, col = "red")
  }
  invisible(NULL)
}
# Read csv data from any location
readcsv <- function(datapath, dataname) {
  # Read the CSV file `dataname` located in directory `datapath`.
  #
  # The two parts are joined with a single "/" and the resulting path is
  # read with read.csv() using its default settings.
  # Returns the data frame produced by read.csv().
  full_path <- paste0(datapath, "/", dataname)
  read.csv(full_path)
}
## Perform correlation plot for two variables
CorrePlotXY <- function(data, X, Y, color, Xlab, Ylab, corrtype) {
  # Scatterplot of two variables with a regression line and the correlation
  # coefficient annotated, drawn with ggscatter() after attaching ggpubr.
  #
  # data: data passed to ggscatter().
  # X, Y: forwarded as `x` and `y`.
  # color: forwarded as `color`.
  # Xlab, Ylab: axis labels, forwarded as `xlab` and `ylab`.
  # corrtype: forwarded as `cor.method`.
  # Returns whatever ggscatter() returns.
  library(ggpubr)
  ggscatter(
    data,
    x = X,
    y = Y,
    xlab = Xlab,
    ylab = Ylab,
    color = color,
    add = "reg.line",
    cor.coef = TRUE,
    cor.method = corrtype
  )
}
CheckforNormality <- function(dat) {
  # Shapiro-Wilk normality test for each group of a long-format data frame.
  #
  # dat: data frame with a grouping column named `variable` and a numeric
  #   column named `Score`.
  #
  # Returns a data frame with one row per `variable` and the columns
  # variable, "W Statistic", "p-value" (both rounded to 3 decimals) and
  # "Variable Distribution Assessment" (a text verdict at the 0.05 level).
  library(dplyr)
  df <- dat %>%
    group_by(variable) %>%
    summarise(
      `W Statistic` = shapiro.test(Score)$statistic,
      `p-value` = shapiro.test(Score)$p.value
    )
  df <- as.data.frame(df)

  # Decide on the unrounded p-value; rounding before comparing would turn
  # e.g. 0.046 into 0.05 and wrongly report the variable as normal.
  not_normal <- df[[3]] < 0.05

  df[[2]] <- round(df[[2]], 3)
  df[[3]] <- round(df[[3]], 3)
  df$`Variable Distribution Assessment` <- as.character(ifelse(
    not_normal,
    paste0(df[[1]], " is not normal(pval<.05)"),
    paste0(df[[1]], " distribution is normal")
  ))
  df
}
# Function call: Normality Test
QQNormality_Plot <- function(data_long) {
  # Q-Q plot with a bootstrap confidence band, one facet per variable.
  #
  # data_long: long-format data frame; its columns are renamed to
  #   "variable" and "value", in that order.
  #
  # Returns the plot object built by ggplot().
  # NOTE(review): stat_qq_band(), stat_qq_line() and stat_qq_point() are not
  # loaded anywhere in this file (presumably from the qqplotr package), so
  # they must already be attached by the caller.

  # Attach ggplot2 here, as QQPlot() does, instead of relying on the caller.
  library(ggplot2)
  colnames(data_long) <- c("variable", "value")
  plot_title <- paste0(
    "QQ Plot for Normality of Distribution \n ",
    "Points must lie inside the confidence band"
  )
  ggplot(
    data = data_long,
    mapping = aes(sample = value, color = variable, fill = variable)
  ) +
    stat_qq_band(alpha = 0.5, conf = 0.95, qtype = 1, bandType = "boot", B = 5000) +
    stat_qq_line(identity = TRUE) +
    stat_qq_point(col = "black") +
    ggtitle(plot_title) +
    facet_wrap(~ variable, scales = "free") +
    labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_bw()
}
# Function call: Normality Test
QQPlot <- function(data_long, Toword, fileout) {
  # Q-Q plot with a bootstrap confidence band, one facet per variable.
  #
  # data_long: long-format data frame with columns `variable` and `value`.
  # Toword, fileout: currently unused; kept so existing calls keep working.
  #
  # Draws the plot on the active graphics device and returns the plot object
  # invisibly.
  # NOTE(review): stat_qq_band(), stat_qq_line() and stat_qq_point() are not
  # loaded anywhere in this file (presumably from the qqplotr package), so
  # they must already be attached by the caller.
  library(ggplot2)
  plot_title <- paste0(
    "Quantile-Quantile Test for Normality \n ",
    "Normally distributed if points are inside the confidence band"
  )
  qq_plot <- ggplot(
    data = data_long,
    mapping = aes(sample = value, color = variable, fill = variable)
  ) +
    stat_qq_band(alpha = 0.5, conf = 0.95, qtype = 1, bandType = "boot", B = 500) +
    stat_qq_line(identity = TRUE) +
    stat_qq_point(col = "black") +
    ggtitle(plot_title) +
    facet_wrap(~ variable, scales = "free") +
    labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_bw()

  # A ggplot object is not drawn automatically inside a function, so print
  # it explicitly; otherwise the plot is built and then discarded.
  print(qq_plot)
  invisible(qq_plot)
}
PlotHistDensity <- function(vars, data_wide) {
  # For up to the first three columns of `data_wide`, draw an index plot of
  # the values next to a histogram overlaid with a fitted normal curve.
  #
  # vars: currently unused; kept so existing calls keep working.
  # data_wide: data frame (or matrix) with one numeric variable per column.
  #
  # Panels are laid out on a 3 x 2 grid. NAs are dropped per column.
  # Called for its plotting side effect; returns NULL invisibly.

  # Restore the caller's layout on exit, even if plotting fails part-way.
  old_par <- par(mfrow = c(3, 2))
  on.exit(par(old_par), add = TRUE)

  # Never index past the last available column.
  n_plot <- min(3L, ncol(data_wide))
  for (i in seq_len(n_plot)) {
    dat <- na.omit(data_wide[, i])
    plot(dat)
    h <- hist(dat, main = colnames(data_wide)[i])
    x <- seq(from = min(dat), to = max(dat), length.out = 50)
    # Scale the density to the count axis: expected count per bin is
    # n * bin width * density, using the bin width hist() actually chose.
    bin_width <- diff(h$breaks)[1]
    norm_dist <- dnorm(x, mean = mean(dat), sd = sd(dat)) *
      bin_width * length(dat)
    lines(x, norm_dist, col = "violet", lwd = 4)
  }
  invisible(NULL)
}