########################################################################
# Data Visualization - 01
########################################################################
# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into this analysis.
rm(list = ls())

# Importing data from a CSV file
# Dataset source:
# https://www.kaggle.com/datasets/spscientist/students-performance-in-exams?resource=download
help("read.csv")

# Insert the address of the "Data.csv" file.
# The path is assembled with file.path() so that the string literal is never
# split across source lines (a line break inside the quotes becomes part of
# the path and the file is then not found).
# NOTE(review): the course folder is assumed to be named
# "Statistics_for_Data_Science _MDAN 51163" (single spaces) - confirm on disk.
# data_path <- "C:/Users/DIM/Downloads/Data.csv"
data_path <- file.path(
  "F:/Msc(Kealniya)/1st Semi",
  "Statistics_for_Data_Science _MDAN 51163",
  "Lecturer 01",
  "Data.csv"
)

# Read the file once and keep the result; calling read.csv() without
# assigning it only prints the whole data frame to the console.
data <- read.csv(data_path)

####### Data exploration
# First rows of the data
head(data)
# Data structure: column names, types and a preview of the values
str(data)
########################################################################
# Quantitative data
########################################################################
# Print the raw math scores
data$math_score

# Plot the histogram of math_score
help("hist")
hist(data$math_score)
hist(data$reading_score)
# Same histogram with a title and axis labels
hist(data$math_score,
     main = "Histogram of Math Score",
     xlab = "Math Score", ylab = "Frequency")

# Frequency polygon of the math_score
# install.packages("ggplot2")
library(ggplot2)
help(ggplot)
# Inside aes() refer to the column by its bare name: ggplot looks it up in
# the `data` argument. Writing data$math_score there bypasses that lookup
# and is discouraged by ggplot2.
ggplot(data, aes(math_score)) +
  geom_freqpoly(bins = 10)

# Density plot of math_score
density(data$math_score)
plot(density(data$math_score), main = "Density of Math Score",
     xlab = "Math Score", ylab = "Density")

# Scatterplot of math_score vs writing scores
plot(data$math_score, data$writing_score,
     main = "Scatterplot",
     xlab = "Math score", ylab = "Writing score", pch = 19)

# Line plot of the same two variables (points joined in row order).
# The axis labels name the plotted variables.
plot(data$math_score, data$writing_score, type = "o", main = "Line plot",
     xlab = "Math score", ylab = "Writing score", pch = 19)
help(grid)
########################################################################
# Qualitative data
########################################################################
# Extract group data
Race_Group <- data$race
# Frequency table of group data
Group <- table(Race_Group)

# Pie chart
help(pie)
pie(Group, main = "Pie Chart")

# Basic barplot directly from the frequency table
help("ggplot")
# ggplot() expects a data frame and needs an aesthetic mapping plus a geom
# to draw anything. as.data.frame() on the one-way table yields the columns
# `Race_Group` (category) and `Freq` (count).
ggplot(data = as.data.frame(Group), aes(x = Race_Group, y = Freq)) +
  geom_bar(stat = "identity")

# Constructing a data frame by hand
help("data.frame")
table(Race_Group)
group_label <- c("A", "B", "C", "D", "E") # Group names
count <- c(89, 190, 319, 262, 140)        # Count of each group
group_count <- data.frame(group_label, count)

# Basic barplot from the hand-built data frame
ggplot(data = group_count, aes(x = group_label, y = count)) +
  geom_bar(stat = "identity")
help("geom_bar")

# Pareto Chart
# install.packages("qcc")
library(qcc)
# Name the counts so that the bars are labelled with the group names.
pareto.chart(setNames(count, group_label))
# Cross-tabulate gender against race
x <- subset(data, select = c(gender, race))
table(x)

# Enter the cross-tabulated counts by hand in long format:
# one row per (group, gender) combination.
grp <- rep(c("A", "B", "C", "D", "E"), times = 2)
gen <- rep(c("F", "M"), each = 5)
fre <- c(
  36, 104, 180, 129, 69, # female counts for groups A-E
  53, 86, 139, 133, 71   # male counts for groups A-E
)
df <- data.frame(grp, gen, fre)

# Barplot with multiple groups: bars stacked by gender
ggplot(df, aes(x = grp, y = fre, fill = gen)) +
  geom_col()

# Same data with the gender bars placed side by side
ggplot(df, aes(x = grp, y = fre, fill = gen)) +
  geom_col(position = "dodge")
# Sort the rows in ascending order of math_score
data_sorted <- data[order(data$math_score), ]
# Sort the rows in descending order of math_score
# (the name states the actual sort direction)
data_sorted_descending <- data[order(data$math_score, decreasing = TRUE), ]
#########################################################################
######
# Visualizing multivariate data:
# The "women" dataset is available in R:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/women.html
help(women)
# Keep the built-in dataset under its own name. Assigning it to `data` would
# overwrite the student data, which the subsetting section further down this
# script still needs (columns gender, math_score, ...).
women_data <- women
h <- women_data$height
w <- women_data$weight

# Scatterplot
plot(women_data, xlab = "Height (in)", ylab = "Weight (lb)",
     main = "women data: American women aged 30-39")
grid(nx = 10, ny = 10)

# Line plot
plot(h, w, type = "o", main = "Line plot",
     xlab = "Height", ylab = "weight", pch = 19)
help(grid)
#########################################################################
######
## Group Activity: Perform a descriptive analysis for your dataset and
## interpret your results.
# Group/Room  Dataset     Data Description
# 1  airquality  https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/airquality.html
# 2  attenu      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/attenu.html
# 3  freeny      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/freeny.html
# 4  iris        https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html
# 5  quakes      https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/quakes.html
# 6  rock        https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/rock.html
# 7  stackloss   https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/stackloss.html
# 8  swiss       https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/swiss.html
# Check the datasets available in R:
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html
#########################################################################
######
## Subsetting data
# Extract data between entry 30 and entry 55 for the first 3 variables
data_subset1 <- data[30:55, 1:3]
# Extract data between entry 30 and entry 55 for the first, second, and
# fifth variables
data_subset1_1 <- data[30:55, c(1, 2, 5)]

# Extract data of the male students:
help("subset")
data_subset2 <- subset(data, gender == "male")

# Extract data of the male students who scored more than 75 for
# mathematics:
data_subset3 <- subset(data, gender == "male" & math_score > 75)

# Extract data of the male students or students who scored more than 75
# for mathematics. The `|` keeps a row when any of these holds:
# scenario 1: gender == "male"
# scenario 2: math_score > 75
# scenario 3: gender == "male" and math_score > 75
data_subset3_1 <- subset(data, gender == "male" | math_score > 75)

# Extract scores of the male students who scored more than 75 for
# mathematics:
data_subset4 <- subset(data, gender == "male" & math_score > 75,
                       select = c(math_score, reading_score, writing_score))

# Alternative method: start from data_subset3 and keep the score columns.
# data_subset4_1 <- subset(data_subset3,
#                          select = c(math_score, reading_score,
#                                     writing_score))
# Select the columns by name rather than by position so that the result
# matches data_subset4 whatever the column order of the CSV file is.
score_columns <- c("math_score", "reading_score", "writing_score")
data_subset4_2 <- data_subset3[, score_columns]
# To get a random sample from the whole data set
set.seed(123) # Fix the seed so the same random sample is drawn every run
# Keep each row independently with probability 0.4, so the sample holds
# roughly (not exactly) 40% of the rows.
# R is case sensitive: the data frame is `data`, not `DATA`.
sample_40 <- sample(c(TRUE, FALSE), nrow(data), replace = TRUE,
                    prob = c(0.4, 0.6))
SAMPLE <- data[sample_40, ]
########################################################################
# Descriptive Statistics 02
########################################################################
# Build a small univariate sample to summarise
data <- c(
  4, 10, 5, 8, 7.5, 8, 5, 16.5, 1, 7.8,
  8, 10, 11, 18, 15, 9, 14, 23, 21, 28
)

# ---- Central tendency ----
# Mean
help(mean)
mean(data)
Mean_data <- mean(data)

# Median
help(median)
median(data)
med <- median(data)
MED <- med

# Mode: read it off the frequency table (the most frequent value)
table(data)

# ---- Spread ----
# Range = maximum - minimum
max(data) # Maximum
min(data) # Minimum
Range <- diff(range(data))

# Standard deviation
help(sd)
sd(data)

# Variance
var(data)

# Coefficient of variation = standard deviation / mean * 100
relative_spread <- sd(data) / mean(data)
cv <- relative_spread * 100

# Quartiles and inter-quartile range
help("quantile")
Q1 <- quantile(data, 0.25)
quantile(data, c(0.25, 0.75))
IQR(data)

# Five-number summary (summary() also reports the mean)
summary(data)

# ---- Graphical summaries ----
# Boxplot
help("boxplot")
boxplot(data)

# Histogram
hist(data)
help(hist)

# Density plot
density(data)
plot(density(data))
# Shape of the distribution: skewness and kurtosis
# (both functions come from the "moments" package)
# install.packages("moments")
library(moments)
help("skewness")
skewness(x = data)
kurtosis(x = data)
## Data set with a missing value
# A numeric vector whose last entry is missing (NA)
x <- c(12, 7, 3, 4.2, 18, 2, 54, -21, 8, -5, NA)

help(mean)

# Mean computed with the NA left in: the result is NA
mean_WithMissing <- mean(x)
mean_WithMissing

# Mean computed after dropping the NA entries
mean_WithOutMissing <- mean(x, na.rm = TRUE)
mean_WithOutMissing
#########################################################################
######
## Perform a descriptive analysis for the "iris" dataset and interpret
## your results.
# iris: https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/iris.html
help(iris)
# Work on a copy of the built-in dataset
DATA <- iris
########################################################################
# Poisson Distribution 04
# Content: Density, distribution function, and random generation
# for the Poisson distribution with parameter lambda(=mu).
# Density function: dpois(x, lambda, log = FALSE)
# Distribution function (cumulative probability):
#   ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
# Random Generation: rpois(n, lambda)
########################################################################
# Remove every object from the current workspace so that the Poisson section
# starts clean; nothing defined above this line is available afterwards.
rm(list=ls()) # Clear the Environment
# Example:
# RDA investigated that there are twelve cars crossing a bridge per
# minute on average.
# (a) Find the probability of having
#     (i)   no cars
#     (ii)  three or more cars
#     (iii) less than 17 cars
#     crossing the bridge in a particular minute.
# (b) Plot the probability distribution of the number of cars crossing the
#     bridge.
# X = The number of cars crossing the bridge in a particular minute
# x = 0, 1, 2, 3, ...
# X follows a Poisson(lambda = 12) distribution (lambda = mu).
help(dpois)

# (i) no cars: P(X = 0)
x <- 0
P_X_0 <- dpois(x, lambda = 12)
P_X_0
sprintf("P(X = 0) = %s", round(P_X_0, digits = 6))
sprintf("The probability of no cars crossing the bridge in a minute is %s",
        round(P_X_0, digits = 6))

# (ii) three or more cars: P(X >= 3) = 1 - P(X <= 2)
P_X_geq_3 <- 1 - ppois(2, lambda = 12) # via the lower tail
P_X_geq_3
# The same probability straight from the upper tail.
# ppois(q, lower.tail = FALSE) returns P(X > q), so P(X >= 3) = P(X > 2)
# needs q = 2 (q = 3 would leave out P(X = 3)).
P_X_GEQ_3 <- ppois(2, lambda = 12, lower.tail = FALSE) # upper tail
P_X_GEQ_3
sprintf("P(X >= 3) = %s", round(P_X_GEQ_3, digits = 4))

# (iii) less than 17 cars: P(X < 17) = P(X <= 16)
# X is integer valued, so "less than 17" stops at 16.
P_X_lt_17 <- ppois(16, lambda = 12) # lower tail
P_X_lt_17
sprintf("P(X < 17) = %s", round(P_X_lt_17, digits = 4))
# (b) The Poisson probability distribution plot
x <- 0:20
# Stack the two charts vertically. par() returns the previous settings,
# which are restored below so that later plots use the original layout.
old_par <- par(mfrow = c(2, 1))

# Probability mass function P(X = x), computed with dpois()
# (the chart labels call it "pdf")
barplot(dpois(x, lambda = 12), col = "red", names.arg = x,
        xlab = "X = No of cars crossing the bridge",
        ylab = "pdf: P(X = x)",
        main = "Poisson (mu = 12) pdf")

# Cumulative distribution function P(X <= x), computed with ppois()
barplot(ppois(x, lambda = 12), col = "red", names.arg = x,
        xlab = "X = No of cars crossing the bridge",
        ylab = "cdf: P(X <= x)",
        main = "Poisson (mu = 12) cdf")

# Restore the plotting layout that was active before this section
par(old_par)
########################################################################
# Extra: Random generation for a Poisson distribution with parameter
# lambda (= mu).
# rpois(n, lambda)
# Create a data set of 30 samples from a Poisson distribution with
# lambda = 6.23
set.seed(2) # Fix the seed so the same sample is generated on every run
rpois(n = 30, lambda = 6.23)
########################################################################
# Exercise:
# The number of accidents that occur at a busy intersection is Poisson
# distributed with a mean of 3.5 per week. Find the probability of the
# following events:
# (a) Less than three accidents in a week
# (b) Five or more accidents in a week
# (c) No accidents today