# Setup ----
# Install the required packages only when they are missing, so that re-running
# the script does not reinstall them (and hit the network) every time.
required_packages <- c("wooldridge", "ggplot2")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(wooldridge)
library(ggplot2)
# Load the `bwght` data set used throughout Question 1.
data(bwght)
# QUESTION 1 ----
# 1a. Descriptive statistics for birth weight (column `bwght` of `bwght`).
# Mean, with missing values dropped. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mean_bwght <- mean(bwght$bwght, na.rm = TRUE)
# Median, with missing values dropped.
median_bwght <- median(bwght$bwght, na.rm = TRUE)
# Mode: base R has no built-in statistical mode, so use a small helper.
#
# getmode(v) returns the value that occurs most often in `v`.
#   v: a vector.
# Ties are resolved in favour of the value that appears first in `v`, because
# which.max() returns the first maximum and unique() keeps first-seen order.
# NA is counted like any other value.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element of `v` within the distinct values.
  positions <- match(v, distinct_values)
  occurrences <- tabulate(positions)
  distinct_values[which.max(occurrences)]
}
# Most frequent birth weight, via the getmode() helper.
mode_bwght <- getmode(bwght[["bwght"]])
# Quartiles: 25th, 50th and 75th percentiles, with missing values dropped.
quartiles_bwght <- quantile(
  bwght[["bwght"]],
  probs = c(1, 2, 3) / 4,
  na.rm = TRUE
)
# Show the descriptive statistics computed for 1a.
mean_bwght
median_bwght
mode_bwght
quartiles_bwght
# 2a. Mean of the `cigs` column (cigarettes smoked per day), NAs dropped.
avg_smoke <- mean(bwght[["cigs"]], na.rm = TRUE)
avg_smoke
# 2b. Association between smoking and birth weight.
# ggplot2 is already installed and attached at the top of the script, so it is
# not reinstalled here: reinstalling on every run is slow and needs network
# access.
# Scatterplot of birth weight against cigarettes smoked, with one point colour
# per distinct `cigs` value and a fitted linear trend line.
ggplot(bwght, aes(x = cigs, y = bwght)) +
  geom_point(aes(color = factor(cigs))) +
  geom_smooth(method = "lm") +
  labs(
    title = "Birth Weight vs. Number of Cigarettes Smoked",
    x = "Number of Cigarettes Smoked",
    y = "Birth Weight"
  )
# 3a. Distribution of family income (`faminc`): mean and standard deviation,
# both computed with missing values dropped.
mean_income <- mean(bwght[["faminc"]], na.rm = TRUE)
sd_income <- sd(bwght[["faminc"]], na.rm = TRUE)
mean_income
sd_income
# 3b. Correlation between birth weight and the log of family income,
# computed separately for smokers and non-smokers.
bwght$log_income <- log(bwght$faminc)
# Smokers are all rows with at least one cigarette per day. The threshold is
# `> 0` rather than `> 1`, so rows with cigs == 1 are not silently dropped
# from both groups.
smoke_data <- bwght[bwght$cigs > 0, ]
cor_smoke <- cor(
  smoke_data$bwght, smoke_data$log_income,
  use = "complete.obs"
)
# Non-smokers are the rows with zero cigarettes.
non_smoke_data <- bwght[bwght$cigs == 0, ]
cor_non_smoke <- cor(
  non_smoke_data$bwght, non_smoke_data$log_income,
  use = "complete.obs"
)
cor_smoke
cor_non_smoke
# 4a. Mean of `fatheduc` (NAs dropped) and the number of non-missing values
# that the mean is based on.
fatheduc_values <- bwght[["fatheduc"]]
mean_fatheduc <- mean(fatheduc_values, na.rm = TRUE)
count_fatheduc <- sum(!is.na(fatheduc_values))
rm(fatheduc_values)
mean_fatheduc
count_fatheduc
# 4b. Proportion of mothers who are high school graduates, taken here as
# `motheduc` of at least 12. The mean of the logical vector is the share of
# TRUE values among non-missing rows.
prop_highschool <- mean(bwght[["motheduc"]] >= 12, na.rm = TRUE)
prop_highschool
# 4c. Average birth weight by the mother's high school graduation status.
# The column name is spelled out in full (`motheduc`); the abbreviated
# `mothed` only worked through `$` partial matching, which breaks as soon as
# another column with the same prefix exists or the data becomes a tibble.
bwght$highschool <- bwght$motheduc >= 12
# Mean birth weight per group; aggregate() drops rows with a missing
# `highschool` value under its default na.action.
avg_bwght_highschool <- aggregate(bwght ~ highschool, data = bwght, FUN = mean)
# Boxplot of birth weight for each graduation status.
ggplot(bwght, aes(x = factor(highschool), y = bwght)) +
  geom_boxplot() +
  labs(
    title = "Birth Weight by High School Graduation Status",
    x = "High School Graduate",
    y = "Birth Weight"
  )
avg_bwght_highschool
# QUESTION 2 ----
data(meap01)
# 1. Smallest and largest values of `read4` (NAs dropped) and the spread
# between them.
max_read4 <- max(meap01[["read4"]], na.rm = TRUE)
min_read4 <- min(meap01[["read4"]], na.rm = TRUE)
difference_read4 <- max_read4 - min_read4
min_read4
max_read4
difference_read4
# 2. Number and percentage of schools with a perfect reading pass rate
# (`read4` equal to 100), plus the number with a rate of exactly 50.
# The percentage is taken over all rows of `meap01`.
total_schools <- nrow(meap01)
perfect_pass_rate_count <- sum(meap01[["read4"]] == 100, na.rm = TRUE)
percentage_perfect <- 100 * (perfect_pass_rate_count / total_schools)
pass_rate_50_count <- sum(meap01[["read4"]] == 50, na.rm = TRUE)
perfect_pass_rate_count
percentage_perfect
pass_rate_50_count
# 3. Average pass rates for math (`math4`) and reading (`read4`), then a bar
# chart comparing the two averages.
avg_math4 <- mean(meap01[["math4"]], na.rm = TRUE)
avg_read4 <- mean(meap01[["read4"]], na.rm = TRUE)
avg_math4
avg_read4
library(ggplot2)
# One row per subject, holding that subject's average pass rate.
avg_rates <- data.frame(
  Subject = c("Math", "Reading"),
  Average_Pass_Rate = c(avg_math4, avg_read4)
)
ggplot(
  avg_rates,
  aes(x = Subject, y = Average_Pass_Rate, fill = Subject)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Pass Rates for Math and Reading",
    x = "Subject",
    y = "Average Pass Rate"
  )
# 4a and 4b. Pearson correlation between the math and reading pass rates
# (cor() defaults to Pearson), using only rows where both are present, then a
# scatterplot of the two.
correlation <- cor(
  meap01[["math4"]], meap01[["read4"]],
  use = "complete.obs"
)
correlation
# Points are coloured by the `enroll` column; the line is a linear fit drawn
# without its confidence band.
ggplot(meap01, aes(x = math4, y = read4)) +
  geom_point(aes(color = enroll)) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Scatterplot of Math vs. Reading Pass Rates",
    x = "Math Pass Rate",
    y = "Reading Pass Rate"
  ) +
  scale_color_distiller(palette = "Spectral")
# 5. School size categories from `enroll`, and average pass rates per category.
# Each label is written on a single line; the previous "Small" label had a
# line break inside the string literal, which put a newline in the factor level.
# NOTE(review): cut() uses right-closed intervals by default, so an enrolment
# of exactly 300 falls in "Small" and exactly 600 in "Medium" - confirm this
# matches the intended category boundaries.
meap01$size_cat <- cut(
  meap01$enroll,
  breaks = c(-Inf, 300, 600, Inf),
  labels = c("Small (<300)", "Medium (300-600)", "Large (>600)")
)
# Mean math and reading pass rate within each size category.
avg_pass_rates_by_size <- aggregate(
  cbind(math4, read4) ~ size_cat,
  data = meap01, FUN = mean, na.rm = TRUE
)
# Reshape to one row per (size category, subject). position = "dodge" only
# separates bars within a single layer, so the earlier two-layer version drew
# the reading bars directly on top of the math bars.
avg_pass_rates_long <- rbind(
  data.frame(
    size_cat = avg_pass_rates_by_size$size_cat,
    Subject = "Math",
    pass_rate = avg_pass_rates_by_size$math4
  ),
  data.frame(
    size_cat = avg_pass_rates_by_size$size_cat,
    Subject = "Reading",
    pass_rate = avg_pass_rates_by_size$read4
  )
)
ggplot(
  avg_pass_rates_long,
  aes(x = size_cat, y = pass_rate, fill = Subject)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Pass Rates by School Size",
    x = "School Size Category",
    y = "Average Pass Rate"
  ) +
  scale_fill_manual(
    name = "Subject",
    values = c("Math" = "blue", "Reading" = "red")
  )
# QUESTION 3 ----
data(wage1)
# 1. Average, lowest and highest years of education (`educ`), NAs dropped.
avg_education <- mean(wage1[["educ"]], na.rm = TRUE)
min_education <- min(wage1[["educ"]], na.rm = TRUE)
max_education <- max(wage1[["educ"]], na.rm = TRUE)
avg_education
min_education
max_education
# 2. Average per-hour wage (`wage`), with missing values dropped.
avg_wage <- mean(wage1[["wage"]], na.rm = TRUE)
avg_wage
# 3. Proportion of women and men in the sample. The mean of the `female`
# indicator is the share of women; the share of men is its complement.
prop_women <- mean(wage1[["female"]], na.rm = TRUE)
prop_men <- 1 - prop_women
prop_women
prop_men
# 4. Probability that a randomly chosen woman is married: keep the rows where
# `female` equals 1, then average the `married` indicator over them.
# which() drops rows where the comparison is NA, as subset() would.
women_data <- wage1[which(wage1[["female"]] == 1), , drop = FALSE]
prop_married_women <- mean(women_data[["married"]], na.rm = TRUE)
prop_married_women
# 5a. Association between hourly wage and education: scatterplot of `wage`
# against `educ` with a fitted linear trend line.
library(ggplot2)
ggplot(wage1, aes(x = educ, y = wage)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Hourly Wage vs. Education",
    x = "Years of Education",
    y = "Hourly Wage"
  )
# 5b. Hourly wage and education by marital status: the same scatterplot, with
# points and linear fits coloured by the `married` indicator. The legend
# labels are applied in level order (first level "Unmarried", second "Married").
ggplot(wage1, aes(x = educ, y = wage, color = factor(married))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Hourly Wage vs. Education by Marital Status",
    x = "Years of Education",
    y = "Hourly Wage",
    color = "Marital Status"
  ) +
  scale_color_manual(
    values = c("blue", "red"),
    labels = c("Unmarried", "Married")
  )
# 5c. Hourly wage and education by gender and marital status: colour encodes
# the `married` indicator and point shape encodes the `female` indicator.
# Each `+` must end a line. A `+` at the start of the next line leaves the
# ggplot() call complete on its own, and R then fails on the dangling unary `+`
# instead of adding the layers.
ggplot(
  wage1,
  aes(x = educ, y = wage, color = factor(married), shape = factor(female))
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Hourly Wage vs. Education by Gender and Marital Status",
    x = "Years of Education",
    y = "Hourly Wage",
    color = "Marital Status",
    shape = "Gender"
  ) +
  scale_color_manual(
    values = c("blue", "red"),
    labels = c("Unmarried", "Married")
  ) +
  scale_shape_manual(values = c(16, 17), labels = c("Male", "Female"))