Assignment HTML
Mohammed Hamad
2024-07-16
{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)
Introduction
This document contains a series of tasks that are part of the “Introduction to R
Programming for Data Science” course offered by IBM on Coursera. These tasks serve as
practical exercises to reinforce the concepts covered in the course modules.
R Markdown
Course Information
• Course Name: Introduction to R Programming for Data Science
• Provider: IBM on Coursera
Objective
The primary objective of these tasks is to provide hands-on experience with R
programming techniques essential for data science. Each task is designed to cover specific
aspects of data manipulation, visualization, and analysis using R.
Structure of the Document
This document consists of 10 tasks. Each task focuses on a different aspect of R
programming and data science concepts taught in the course. The tasks are designed to be
completed sequentially, following the course modules.
To install Packages:
#install.packages("httr")
#install.packages("rvest")
To load the libraries:
library(httr)
library(rvest)
TASK 1: Get a COVID-19 pandemic Wiki page using HTTP request
To write the Get function:
# Request a wiki page over HTTP.
#
# url:   address of the wiki endpoint the request is sent to
# param: page title, passed along as the `title` query parameter
# Returns the response object produced by GET().
get_wiki_covid19_page <- function(url, param) {
  GET(url, query = list(title = param))
}
To call get_wiki_covid19_page
# Fetch the COVID-19 testing template page. The URL is written as one
# unbroken string: a literal split across two lines would carry the line
# break into the request address.
get_wiki_covid19_page(
  "https://en.wikipedia.org/w/index.php",
  "Template:COVID-19_testing_by_country"
)
TASK 2: Extract COVID-19 testing data table from the wiki HTML page
Now use the read_html function in rvest library to get the root html node from
response
library(rvest)
# Address of the wiki template page. It is assembled from two complete
# pieces so that no line break ends up inside the URL string.
url <- paste0(
  "https://en.wikipedia.org/w/index.php",
  "?title=Template:COVID-19_testing_by_country"
)
# Download and parse the page; the result is the root HTML node
root_node <- read_html(url)
root_node
Get the tables in the HTML root node using html_nodes function
# Collect every node under the root that matches the "table" selector
table_node <- html_nodes(root_node, "table")
# Print the resulting node set
table_node
Notice that the table we need is the second one in the node set, which is the wikitable.
Hint: read table_node at index 2 (e.g. table_node[2]).
# Take the second entry of the node set, parse it with html_table(), and
# coerce the result to a data frame
data_frame <- as.data.frame(html_table(table_node[2]))
# Show the first rows of the extracted table
head(data_frame)
TASK 3: Pre-process and export the extracted data frame
The goal of task 3 is to pre-process the extracted data frame from the previous
step, and export it as a csv file
Let’s get a summary of the data frame
# Print a per-column summary of the raw, not yet pre-processed table
summary(data_frame)
# Clean the raw COVID-19 testing table extracted from the wiki page.
#
# data_frame: data frame holding the extracted table. It is expected to
#   contain the columns `Country.or.region`, `Ref.` and `Units.b.`, plus
#   seven further columns in the order given in the renaming step below.
# Returns a data frame without the "World" row, limited to at most the
# first 172 rows, without the `Ref.` and `Units.b.` columns, with the
# columns renamed, `country`/`date` as factors and the count and ratio
# columns as numeric.
preprocess_covid_data_frame <- function(data_frame) {
  # Remove the World row. %in% never yields NA, so a missing country name
  # cannot turn into an all-NA row the way a comparison with `==` would.
  is_world <- data_frame$`Country.or.region` %in% "World"
  data_frame <- data_frame[!is_world, ]

  # Keep the first 172 rows, which drops the rows trailing the country
  # list. Unlike indexing with 1:172, head() does not pad the result with
  # all-NA rows when the table holds fewer rows than that.
  data_frame <- head(data_frame, 172L)

  # We don't need the Units and Ref columns, so they can be removed
  data_frame["Ref."] <- NULL
  data_frame["Units.b."] <- NULL

  # Renaming the columns
  names(data_frame) <- c(
    "country", "date", "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )

  # Convert column data types
  data_frame$country <- as.factor(data_frame$country)
  data_frame$date <- as.factor(data_frame$date)

  # Counts and ratios: strip the thousands separators, then convert
  numeric_columns <- c(
    "tested", "confirmed", "confirmed.tested.ratio",
    "tested.population.ratio", "confirmed.population.ratio"
  )
  for (column in numeric_columns) {
    data_frame[[column]] <- as.numeric(gsub(",", "", data_frame[[column]]))
  }

  data_frame
}
Call the preprocess_covid_data_frame function
# Run the pre-processing on the extracted table and keep the cleaned result
proper_data_frame <- preprocess_covid_data_frame(data_frame)
# Show the first rows of the cleaned table
head(proper_data_frame)
Get the summary of proper_data_frame
# Print a per-column summary of the cleaned table
summary(proper_data_frame)
To save the data frame as a CSV file named covid-19(2024).csv:
# Export the cleaned table without row names. The file name is a single
# unbroken string, so the file on disk is called exactly
# "covid-19(2024).csv" (a literal split across lines would embed a line
# break in the name).
write.csv(proper_data_frame, file = "covid-19(2024).csv", row.names = FALSE)
To check that the file is available:
# Get working directory
wd <- getwd()
# Build the path of the exported file. The file name includes the hyphen
# ("covid-19(2024).csv"), matching the name the CSV is read back from;
# checking for "covid19(2024).csv" would always report FALSE.
file_path <- file.path(wd, "covid-19(2024).csv")
# File path
print(file_path)
# TRUE when the exported file is present in the working directory
file.exists(file_path)
TASK 4: Get a subset of the extracted data frame
The goal of task 4 is to get the 5th to 10th rows from the data frame with only the country and confirmed columns selected
# Load the exported table back from the CSV file
# (an earlier run used the file name "covid-19(2023).csv")
covid_19 <- read.csv(file = "covid-19(2024).csv")
covid_data <- as.data.frame(covid_19)
# Rows 5 through 10, restricted to the "country" and "confirmed" columns
covid_data[seq(5, 10), c("country", "confirmed")]
TASK 5: Calculate worldwide COVID testing positive ratio
The goal of task 5 is to get the total confirmed and tested cases worldwide, and
try to figure the overall positive ratio using confirmed cases / tested cases
# Only rows that report both figures enter the totals: a single missing
# value would otherwise turn a sum into NA, and counting a row in just one
# of the two totals would skew the ratio.
reported <- complete.cases(covid_data[, c("confirmed", "tested")])
# Get the total confirmed cases worldwide
total_confirmed <- sum(covid_data[reported, "confirmed"])
total_confirmed
# Get the total tested cases worldwide
total_tested <- sum(covid_data[reported, "tested"])
total_tested
# Get the positive ratio (confirmed / tested)
positive_ratio <- total_confirmed / total_tested
positive_ratio
# The same ratio rounded to two decimal places
round(positive_ratio, 2)
TASK 6: Get a country list which reported their testing data
The goal of task 6 is to get a catalog or sorted list of countries who have
reported their COVID-19 testing data
# Get the `country` column
covid_data[, "country"]
# Check its class (character when the CSV is read with the read.csv()
# defaults of R >= 4.0, factor with older defaults)
class(covid_data$country)
# Convert the country column into character so that you can easily sort
# them (the whole note must stay inside comment lines to keep the chunk
# runnable)
covid_data$country <- as.character(covid_data$country)
class(covid_data$country)
# Sort the countries A to Z
sort(covid_data$country)
# Sort the countries Z to A
desc_country <- sort(covid_data$country, decreasing = TRUE)
# Print the sorted Z to A list
print(desc_country)
TASK 7: Identify countries names with a specific pattern
The goal of task 7 is to use a regular expression to find any countries whose names start with
United
# Use the regular expression `^United.+` to find matches. The leading `^`
# anchors the pattern to the start of the name, so only countries that
# begin with "United" are reported; without it a name that merely
# contains "United" would match too, and only from that word onwards.
country_matches <- regexpr("^United.+", covid_data$country)
# Print the matched country names
regmatches(covid_data$country, country_matches)
TASK 8: Pick two countries you are interested in, and then review their testing
data
The goal of task 8 is to compare the COVID-19 test data between two countries.
You will need to select two rows from the data frame, and select the country,
tested, confirmed, and confirmed.population.ratio columns
# Select a subset (should be only one row) of the data frame for Jordan,
# keeping the columns used in the comparison. Column names and country
# names are written as unbroken strings so they match the data exactly.
jordan <- covid_data[
  covid_data$country == "Jordan",
  c("country", "tested", "confirmed", "confirmed.population.ratio")
]
# Select a subset (should be only one row) of the data frame for the
# United States, with the same columns
united_states <- covid_data[
  covid_data$country == "United States",
  c("country", "tested", "confirmed", "confirmed.population.ratio")
]
jordan
united_states
I added this code to make the comparison easier by combining the two tables:
# Row(s) for Jordan, limited to the comparison columns
jordan <- covid_data[
  covid_data$country == "Jordan",
  c("country", "tested", "confirmed", "confirmed.population.ratio")
]
# Row(s) for the United States, limited to the same columns
united_states <- covid_data[
  covid_data$country == "United States",
  c("country", "tested", "confirmed", "confirmed.population.ratio")
]
# Stack both selections into a single table, Jordan first
combined_data <- rbind(jordan, united_states)
# Show the stacked table
print(combined_data)
Comparative Analysis of COVID-19 Testing and Confirmed Cases: United States
vs. Jordan
# Is the tested count of the United States larger than Jordan's?
jordan$tested < united_states$tested
# Is the confirmed count of the United States larger than Jordan's?
jordan$confirmed < united_states$confirmed
TASK 9: Compare which one of the selected countries has a larger ratio of
confirmed cases to population
The goal of task 9 is to find out which of the countries you selected before has
the larger ratio of confirmed cases to population, which may indicate that the country
has a higher COVID-19 infection risk
# Pick the message with an if-else expression and print it: the United
# States message when its confirmed-to-population ratio exceeds Jordan's,
# the Jordan message otherwise
print(
  if (jordan$confirmed.population.ratio <
        united_states$confirmed.population.ratio) {
    "United States have higher covid-19 risk"
  } else {
    "Jordan has higher covid-19 risk"
  }
)
TASK 10: Find countries with confirmed to population ratio rate less than a
threshold
The goal of task 10 is to find out which countries have the confirmed to
population ratio less than 1%, it may indicate the risk of those countries are
relatively low
# Get a subset of any countries with `confirmed.population.ratio` less than
# the threshold (1). which() keeps only the positions where the comparison
# is TRUE, so rows with a missing ratio are left out instead of appearing
# as all-NA rows in the result.
new_df <- covid_data[which(covid_data$`confirmed.population.ratio` < 1), ]
new_df