DataQuest Project
Garakishi Guluzade
2024-09-24
R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF,
and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the
output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
Including Plots
You can also embed plots, for example:
1
[Figure: scatter plot of pressure (y-axis, 0 to 800) against temperature (x-axis, 0 to 350).]
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that
generated the plot.
This is a project for the Dataquest course. We will try to answer the question "Which countries have reported the highest
number of positive cases in relation to the number of tests conducted?" for the COVID-19 pandemic.
# Packages used throughout the analysis.
library(tidyverse)
library(readr)
library(dslabs)

# One-time setup needed to knit this document to PDF. Both steps are guarded
# so the package and the TinyTeX distribution are not reinstalled on every run.
if (!requireNamespace("tinytex", quietly = TRUE)) {
  install.packages("tinytex")
}
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}
# Load the COVID-19 dataset; the path is relative to the working directory.
covid19 <- read_csv("covid19.csv")

# Dimensions (rows, columns) of the dataset.
dim(covid19)

# Column names, kept in a vector for later reference.
vector_cols <- colnames(covid19)
vector_cols

# Quick look at the first rows and at the column types.
head(covid19)
glimpse(covid19)

# The spreadsheet-style viewer needs an interactive session, so it is skipped
# when the document is knitted or the script is run in batch mode.
if (interactive()) {
  view(covid19)
}
# Keep only the rows whose Province_State is "All States", then drop the
# Province_State column, which is constant after the filter.
covid_df_all_states <- covid19 %>%
  filter(Province_State == "All States") %>%
  select(-Province_State)
covid_df_all_states
# Keep the date, the country and the daily measures needed for the analysis.
# The selection starts from covid_df_all_states (not the raw covid19 table) so
# that the "All States" filter is preserved; otherwise the per-country sums
# computed later would also add the province-level rows.
covid_df_all_states_daily <- covid_df_all_states %>%
  select(
    Date,
    Country_Region,
    active,
    hospitalizedCurr,
    daily_tested,
    daily_positive
  )
covid_df_all_states_daily
library(dplyr)

# Summarizing the data by grouping by 'Country_Region'
covid_df_all_states_daily_sum <- covid_df_all_states_daily %>%
  group_by(Country_Region) %>% # Group rows by the 'Country_Region' column
  summarize(
    tested = sum(daily_tested, na.rm = TRUE), # Sum of 'daily_tested'
    positive = sum(daily_positive, na.rm = TRUE), # Sum of 'daily_positive'
    active = sum(active, na.rm = TRUE), # Sum of 'active'
    hospitalized = sum(hospitalizedCurr, na.rm = TRUE) # Sum of 'hospitalizedCurr'
  ) %>%
  # Arrange the result in descending order by the 'tested' column
  arrange(desc(tested))
# Display the result
print(covid_df_all_states_daily_sum)

# Keep the first ten rows of the summary table.
covid_top_10 <- head(covid_df_all_states_daily_sum, 10)
covid_top_10
# Pull each column of covid_top_10 out into its own vector.
countries <- covid_top_10$Country_Region
countries

tested_cases <- covid_top_10$tested
tested_cases

positive_cases <- covid_top_10$positive
positive_cases

active_cases <- covid_top_10$active
active_cases

hospitalized_cases <- covid_top_10$hospitalized
hospitalized_cases
# Assign country names to each vector
names(tested_cases) <- countries
names(positive_cases) <- countries
names(active_cases) <- countries
names(hospitalized_cases) <- countries

# Display the named vectors
print(tested_cases)
print(positive_cases)
print(active_cases)
print(hospitalized_cases)
# Ratio of positive cases to tests, element-wise per country.
positive_tested_ratio <- positive_cases / tested_cases
positive_tested_ratio

# Positions of the three largest ratios, then the ratios themselves.
top_3_indexes <- order(positive_tested_ratio, decreasing = TRUE)[1:3]
positive_tested_top_3 <- positive_tested_ratio[top_3_indexes]
positive_tested_top_3
# Keeping relevant information
# Each vector holds, in order: ratio, tested, positive, active, hospitalized
# (the column names are assigned to the matrix below).
united_kingdom <- c(0.11, 1473672, 166909, 0, 0)
united_states <- c(0.10, 17282363, 1877179, 0, 0)
turkey <- c(0.08, 2031192, 163941, 2980960, 0)

# One row per country; the row names come from the variable names.
covid_mat <- rbind(united_kingdom, united_states, turkey)
covid_mat

colnames(covid_mat) <- c("Ratio", "tested", "positive", "active", "hospitalized")
covid_mat
# Putting all together
question <- "Which countries have had the highest number of positive cases against the number of tests?"
answer <- c("Positive tested cases" = positive_tested_top_3)
answer

# Question and answer bundled in one named list.
data_list <- list(question = question, answer = answer)
data_list

# Named list of the data frames covid_df_all_states_daily and covid_top_10.
dataframes_list <- list(
  covid_df_all_states_daily = covid_df_all_states_daily,
  covid_top_10 = covid_top_10
)
# Display the list of dataframes
print(dataframes_list)

# Named list of the matrices; covid_mat is the only one.
matrices_list <- list(covid_mat = covid_mat)

# Display the matrices list
print(matrices_list)
3
# Create a list that contains the vectors
vectors_list <- list(
  tested_cases = tested_cases,
  positive_cases = positive_cases,
  active_cases = active_cases,
  hospitalized_cases = hospitalized_cases,
  countries = countries
)
# Create the combined named list
data_structure_list <- list(
  dataframes = dataframes_list, # From the previous steps
  matrices = matrices_list,
  vectors = vectors_list
)
# Display the final named list
print(data_structure_list)

# Display the vectors list
print(vectors_list)
# Final result: the question, its answer and every supporting data structure.
covid_analysis_list <- list(
  question = question,
  answer = answer,
  data_structure_list = data_structure_list
)
covid_analysis_list

# The second element of the list is the answer.
covid_analysis_list[[2]]