Summary Statistics for a Single Set of Data (with Vectors)
Examples for Vectors and Data Frames
# Sample vector of daily sales for a retail store (in dollars)
daily_sales <- c(5000, 6200, 4800, 5500, 7200, 6300, 5100, 4800, 5400, 6200, 5800, 7000,
6800, 5500, 6100, 5300, 4700, 5900, 6200, 6500, 7200, 6800, 5600, 4800, 5200, 6100, 5800,
7200, 6900, 5500, 6100)
# Calculate single-value summary statistics
mean_sales <- mean(daily_sales)
median_sales <- median(daily_sales)
std_deviation <- sd(daily_sales)
min_sales <- min(daily_sales)
max_sales <- max(daily_sales)
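If you only need a quick overview, R's built-in summary() reports the minimum, quartiles, median, and mean in one call; a minimal sketch using the daily_sales vector above:
# One-call overview of the vector: min, 1st quartile, median, mean, 3rd quartile, max
summary(daily_sales)
# Display the single-value statistics computed above
mean_sales
std_deviation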
Summary Statistics with a Data Frame (a data frame is a data structure that holds two or more sets of data as columns)
NOTE:
There are two ways of creating a data frame.
1. Create each set of data separately as a vector, for example:
Data1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 20)
Data2 = c(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)
Here there are two sets of data that you now want in table format. Assign data.frame() to any variable name, e.g.:
df = data.frame(Data1 = Data1, Data2 = Data2)   # press Enter
df                                              # press Enter to display the table
2. The other way is to write the values directly inside one command:
df = data.frame("Name" = c("Amiya", "Rosy", "Asish"),
                "Gender" = c("Male", "Female", "Male"))
df
(The difference is that instead of creating each vector separately, you write everything directly in one command.)
Here are some more examples:
data_frame_data <- data.frame(
Name = c("Alice", "Bob", "Charlie"),
Math = c(85, 92, 78),
Science = c(88, 90, 85),
History = c(75, 82, 90))
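To get summary statistics from a data frame such as data_frame_data above, summary() can be applied to the whole data frame, or a single function to one column; a short sketch:
# Summary statistics for every column of the data frame
summary(data_frame_data)
# A single statistic for one column, e.g. the mean Math score
mean(data_frame_data$Math)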
#Contingency table
df = data.frame("Name" = c("Amiya", "Rosy", "Asish"),"Gender" = c("Male", "Female",
"Male"))
> table(df)
Output:
Gender
Name Female Male
Amiya 0 1
Asish 0 1
Rosy 1 0
# Sample data
gender <- c("Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female",
"Male", "Female")
brand <- c("Apple", "Samsung", "Samsung", "Apple", "Samsung", "Google", "Apple",
"Google", "Samsung", "Other")
# Note: when using strings (character values), each value must be enclosed in quotation marks inside c()
# Create a data frame
data_df <- data.frame(Gender = gender, Brand = brand)
# Using the xtabs() function
cross_tab2 <- xtabs(~ Gender + Brand, data = data_df)
print(cross_tab2)
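If you also want row and column totals, or proportions instead of counts, base R's addmargins() and prop.table() work directly on the table produced above; a small sketch using cross_tab2:
# Add row and column totals to the cross-tabulation
addmargins(cross_tab2)
# Convert the counts to proportions of the grand total
prop.table(cross_tab2)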
Importing Data from an Excel or CSV File
Finding the file and loading it in R:
getwd()   # press Enter; shows the current working directory, e.g. "C:/Users/Hp/Documents"
data1 = read.csv(file.choose())   # press Enter; a file dialog opens so you can pick the CSV file
data1   # press Enter to display the imported data
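read.csv() handles CSV files; for a native Excel workbook (.xlsx) one common option is the readxl package. A sketch, assuming readxl is installed and the first row of the sheet contains column names:
# install.packages("readxl")   # run once if the package is not installed
library(readxl)
data2 = read_excel(file.choose())   # pick the .xlsx file in the dialog
data2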
COMMANDS FOR DATA ANALYSIS FOR DIFFERENT TESTS
(For each test below: the test name, the sample data, and the commands to run.)
Shapiro-Wilk normality test

Data:
ch1_data <- c(7, 7, 8, 8, 9, 10, 11, 11, 12, 12, 12, 13, 14, 15, 17, 17, 17, 18, 18, 19)
ch2_data <- c(36, 21, 27, 39, 33, 42, 25, 30, 31, 37, 35, 29, 23, 34, 41, 23, 32, 32, 30, 39)

# Summary statistics for channel 1
summary(ch1_data)
# Summary statistics for channel 2
summary(ch2_data)

# Step 3: Create a histogram for each channel
hist(ch1_data, main = "Delivery Times Ch1", xlab = "Delivery Time (hrs)", col = "lightblue", border = "black")
hist(ch2_data, main = "Delivery Times Ch2", xlab = "Delivery Time (hrs)", col = "lightblue", border = "black")

# Step 4: Density plot to visualize the data for channels 1 and 2
dens = density(ch1_data)
plot(dens$x, dens$y)
# For channel 2
dens = density(ch2_data)
plot(dens$x, dens$y)

# Step 5: Conduct the Shapiro-Wilk normality test
shapiro_test_result = shapiro.test(ch1_data)
print(shapiro_test_result)

# Step 6: Kolmogorov-Smirnov test to compare the two channels
ks_test_result = ks.test(ch1_data, ch2_data)
print(ks_test_result)

# Step 7: Create a QQ plot to visually assess the goodness-of-fit
qqnorm(ch1_data)
qqline(ch1_data)
qqnorm(ch2_data)
qqline(ch2_data)

# Step 8: Draw a qqplot to compare the two channels
qp = qqplot(ch1_data, ch2_data)
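To read the Shapiro-Wilk result in code rather than by eye, the p-value is stored in the returned object; a minimal sketch, where the 0.05 cutoff is an assumed significance level, not part of the original notes:
# Compare the stored p-value with an assumed 0.05 significance level
if (shapiro_test_result$p.value < 0.05) {
  print("Reject normality: the data do not look normally distributed")
} else {
  print("Fail to reject normality: the data are consistent with a normal distribution")
}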
Student's t-test

Data:
group_new <- c(80, 85, 88, 92, 78, 90, 84, 88, 85, 89)
group_traditional <- c(75, 82, 79, 88, 70, 81, 75, 80, 78, 83)

t_test_result <- t.test(group_new, group_traditional)
print(t_test_result)
One-sample t-test

Data (assume the hypothesized population mean test score is 75):
sample_data <- c(72, 74, 78, 70, 76, 73, 77, 75, 79, 71, 74, 76, 80, 72, 74,
                 75, 73, 75, 78, 76, 73, 74, 76, 77, 75, 72, 78, 74, 76, 75)

t_test_result <- t.test(sample_data, mu = 75)   # mu is the hypothesized population mean
print(t_test_result)
Two-sample t-test with unequal variances (Welch's t-test)

Data:
group1 <- c(22, 24, 25, 28, 26)
group2 <- c(30, 32, 31, 35, 33)

# Perform a two-sample t-test with unequal variances (Welch's t-test)
t_test_result <- t.test(group1, group2, var.equal = FALSE)

# Print the results
print(t_test_result)

(Note: in this analysis we use unequal variances, so var.equal = FALSE is written explicitly; this is also t.test()'s default. If you want the pooled equal-variance test instead, set var.equal = TRUE.)
One-tailed paired samples t-test

Data:
before_training <- c(50, 55, 48, 52, 45, 47, 53, 49, 51, 50)
after_training <- c(58, 62, 55, 60, 54, 56, 61, 57, 59, 58)

# paired = TRUE treats the two vectors as before/after measurements on the same subjects
t_test_result <- t.test(after_training, before_training, paired = TRUE, alternative = "greater")
print(t_test_result)
Two-sample paired Wilcoxon test (with paired = TRUE this is the Wilcoxon signed-rank test)

Data:
after = c(4, 3, 4, 2, 3)
before = c(6, 7, 8, 5, 7)

result = wilcox.test(after, before, paired = TRUE)
print(result)
Covariance (without xl file)

data <- data.frame(
  Student = 1:10,
  Hours_Studied = c(2, 3, 1, 4, 5, 2, 3, 1, 4, 5),
  Exam_Score = c(65, 75, 60, 80, 90, 70, 75, 55, 85, 95)
)

# Calculate the covariance between Hours_Studied and Exam_Score
# (cov() of two vectors returns a single number, not a matrix)
covariance_value <- cov(data$Hours_Studied, data$Exam_Score)

# Print the covariance
covariance_value
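cov() can also be applied to several numeric columns at once, in which case it really does return a covariance matrix (variances on the diagonal); a short sketch using the data frame above:
# Covariance matrix for the two numeric columns
cov(data[, c("Hours_Studied", "Exam_Score")])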
Correlation (without xl file)

Data:
tv_ad_spend <- c(5000, 5500, 6000, 5500, 5800, 6200, 6500, 7000, 7500, 7200)
sales_revenue <- c(75000, 78000, 82000, 76000, 80000, 84000, 87000, 91000, 95000, 93000)

# Calculate the Pearson correlation coefficient
correlation_coefficient <- cor(tv_ad_spend, sales_revenue, method = "pearson")

# Print the correlation coefficient
correlation_coefficient
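To test whether the correlation is statistically significant rather than only reporting its size, base R's cor.test() gives the coefficient together with a p-value and confidence interval; a minimal sketch with the same two vectors:
# Significance test for the Pearson correlation between ad spend and revenue
cor.test(tv_ad_spend, sales_revenue, method = "pearson")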
Test for association using the chi-squared test (without xl file)

# Create a data frame with demographic data
data <- data.frame(
  Gender = c("Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female"),
  Education_Level = c("High School", "College", "High School", "College", "Graduate",
                      "High School", "College", "Graduate", "High School", "Graduate")
)

# Create a contingency table (cross-tabulation) of the two variables
contingency_table <- table(data$Gender, data$Education_Level)

# Check what the contingency table looks like
contingency_table

# Perform a chi-squared test for independence
chisq.test(contingency_table)
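The expected counts behind the test are stored in the returned object, which is worth checking because the chi-squared approximation is unreliable when expected counts are very small; a small sketch using the contingency_table above:
# Store the test result and inspect the expected counts under independence
chi_result <- chisq.test(contingency_table)
chi_result$expected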
Test for association using the chi-squared test (with xl file)

# Step 1: Load the CSV file
data1 = read.csv(file.choose())
data1

# Step 2: Generate a contingency table summing up the total respondents for each price-rating combination
contingency_table = xtabs(Number.of.respondents ~ Price + Rating, data = data1)
contingency_table

# Step 4: Run the chi-squared test to test the hypothesis
chisq.test(contingency_table)
One-Way ANOVA (with xl file)

# Step 1: Load the CSV file
students_data = read.csv(file.choose())
students_data

# Step 2: Visualize the group means using a boxplot
boxplot(Test_Score ~ Teaching_Method, data = students_data, col = "lightblue", pch = 18,
        main = "Distribution of Test Scores by Teaching Method",
        xlab = "Teaching Method", ylab = "Test Score")

# Step 3: One-way ANOVA
anova_result <- aov(Test_Score ~ Teaching_Method, data = students_data)
summary(anova_result)

# Step 4: Tukey HSD post hoc testing
tukey_results <- TukeyHSD(anova_result)
print(tukey_results)
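One-way ANOVA assumes roughly normal residuals with similar spread across groups; a brief optional check using the fitted anova_result from Step 3:
# Optional check: normality of residuals and a residuals-vs-fitted plot
shapiro.test(residuals(anova_result))
plot(anova_result, which = 1)   # residuals vs fitted values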
Two-Way ANOVA

# Step 1: Load the CSV file and organize it into a data frame
GTL = read.csv(file.choose())
GTL

# Step 2: Using a boxplot, visualize light vs temperature for the different glass types
boxplot(Light ~ Temp * Glass, data = GTL, col = c("lightblue", "lightgreen"),
        main = "Boxplot of Light vs Temperature for Different Glass Types",
        xlab = "Temperature", ylab = "Light")

# Step 3: Formulate a hypothesis about the effect of glass type and temperature on light output, then run the two-way ANOVA
anova_result = aov(Light ~ Glass * Temp, data = GTL)
summary(anova_result)

# Step 4: Conduct post-hoc testing
TukeyHSD(anova_result)
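Because the model includes a Glass * Temp interaction, an interaction plot helps when reading the ANOVA table; a sketch with base R's interaction.plot(), assuming GTL has the Temp, Glass, and Light columns used above:
# Mean light output at each temperature, one line per glass type
interaction.plot(GTL$Temp, GTL$Glass, GTL$Light,
                 xlab = "Temperature", ylab = "Mean Light", trace.label = "Glass")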
Linear Regression

# Step 1: Load the data and organize it into a data frame
height = c(65, 62, 60, 64, 68, 70, 68, 65)
weight = c(75, 70, 65, 72, 75, 80, 72, 64)
student_data = data.frame(Height = height, Weight = weight)
student_data

# Create a simple regression of Weight vs Height
reg = lm(Weight ~ Height, data = student_data)
summary(reg)

# Find the correlation coefficient (the intercept and slope are reported in the regression summary above)
correlation = cor(student_data$Height, student_data$Weight)
correlation
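To see the fitted line against the raw data, plot the points and add the regression line with abline(); a short sketch using the reg model above:
# Scatterplot of the data with the fitted regression line
plot(student_data$Height, student_data$Weight, xlab = "Height", ylab = "Weight",
     main = "Weight vs Height")
abline(reg, col = "red")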
Multiple Regression

Data:
sales = c(10, 15, 12, 18, 20, 22, 25, 28, 39, 32)
advertising = c(5, 6, 6, 8, 10, 12, 15, 16, 18, 20)
pricing = c(20, 18, 16, 15, 14, 13, 12, 11, 10, 9)
competitor_pricing = c(18, 17, 16, 16, 15, 14, 13, 12, 11, 10)

# Step 1: Combine the vectors into a data frame
sales_data = data.frame(Sales = sales, Advertising = advertising, Pricing = pricing,
                        Competitor_Pricing = competitor_pricing)
sales_data

Note: if you enter the data as vectors (that is, without an xl file) for regression or any analysis that involves several columns, you need to combine the vectors into a data frame, as above.

# Step 2: Create a regression model
reg_model = lm(Sales ~ Advertising + Pricing + Competitor_Pricing, data = sales_data)
reg_model
summary(reg_model)
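Once the model is fitted, predict() can estimate sales for new combinations of the predictors; the values below are made up purely for illustration:
# Predict sales for a hypothetical new observation (illustrative values only)
new_obs <- data.frame(Advertising = 14, Pricing = 12, Competitor_Pricing = 13)
predict(reg_model, newdata = new_obs)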