Summary
Coronavirus disease 2019 is a new coronavirus that can be spread from person to person. This virus was first detected in Wuhan, China and was first reported to WHO (World Health Organization) on 31 December, 2019. A global pandemic was declared on 11 March, 2020.
The dataset used in this project can be found on a publicly available data repository which is created and maintained by the Johns Hopkins University Center for Systems Science and Engineering. At the time of this project, the dataset is current as of 11 July, 2020. This data is being updated daily, code can be reproduced to reflect the most current data.
Preparations
Packages
Load pre-defined R packages: “tidyverse”, “scales”, “reactable”, “htmltools”, “gganimate”, “ggthemes”, and “coronavirus”.
library(tidyverse)
library(scales)
# data frames in coronavirus package are in tibble format
library(coronavirus)
library(reactable)
library(htmltools)
library(gganimate)
library(ggthemes)
Main Data
The following interactive table provides a general information on our main dataset, which is current as of 11 July, 2020. Viewers can browse more observations by utilizing the pagination feature and/or the search box at the upper right of the table.
# To view the coronavirus dataset
reactable::reactable(coronavirus, minRows = 5, searchable = TRUE)
At a Glance
Notice the dataset consists of 3 different types: “confirmed”, “death”, and “recovered”. We can take a look at how the number of cases in each 3 types changes over time.
#the different types in main dataset
unique(coronavirus$type)
## [1] "confirmed" "death" "recovered"
# data frame of interest
total_cases_vs_date <- coronavirus %>%
group_by(date, type) %>%
summarize(total_cases = sum(cases))
#plotting total_cases vs Date for these 3 types
ggplot(total_cases_vs_date, aes(date,total_cases)) +
geom_line() +
facet_wrap(~type, ncol=1,scales="free_y") +
my_theme +
ylab("Total cases")

Confirmed Cases
Worldwide confirmed cases
The following graph sugguests that sometime after March, the total number of worldwide confirmed cases rises almost linearly over time and it does not show any signs of slowing down. We will examine whether the increasing behavior is linear in next sections.
# extract the data of interest from the main coronavirus dataset
confirmed_cases_worldwide <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(date) %>%
summarize(cases = sum(cases)) %>%
mutate(cum_cases = cumsum(cases)) %>%
select(date, cum_cases)
# to visualize how quick the covid spread in the world
ggplot(data = confirmed_cases_worldwide, aes(date, cum_cases)) +
geom_line() +
my_theme

China vs. the World
The graph implies that China seemed to be able to slow down the coronavirus as comparing to the rest of the world. The cumulative confirmed cases of non-China countries seem to increase at a steadily rate.
Notice there was a jump in China’s confirmed cases line on 13 Frebruary, 2020. This is actually due to the change in their reporting figures on that day, that instead of lab tests, CT scans were accepted as evidence of the COVID-19.
# extract dataset of interest
confirmed_cases_china_vs_world <- coronavirus %>%
filter(type == "confirmed") %>%
mutate(is_china = case_when(
country == "China" ~ "China",
country != "China" ~ "Not China"
)) %>%
select(is_china, date, cases) %>%
group_by(is_china, date) %>%
summarize(cases = sum(cases)) %>%
mutate(cum_cases = cumsum(cases))
# total number of confirmed cases
frmLast <- confirmed_cases_china_vs_world %>%
slice(which.max(date))
plt_cum_confirmed_cases_china_vs_world <- ggplot(data = confirmed_cases_china_vs_world) +
geom_line(aes(date,cum_cases, group = is_china, color = is_china)) +
geom_point(data = frmLast, aes(date,cum_cases), col = "orange", shape = 21, fill = "white", size = 2, stroke = 1.7, show.legend=FALSE) +
geom_text(data = frmLast, aes(date, cum_cases, label = cum_cases, size = 0.5, vjust = 1.5, hjust = 0.8),show.legend = FALSE) +
my_theme
# see the plot with events
plt_cum_confirmed_cases_china_vs_world +
geom_vline(data = who_events, aes(xintercept = date), linetype = "dashed") +
geom_text(data = who_events, aes(x = date, label = event), y = 5e6, size = 3)

China’s confirmed cases in detail
In this section, we want to see how the trend lines change before and after the COVID-19 being declared as a pandemic. To include China’s reporting-figure change, we will look at the confirmed cases in two time frames: between 2020-02-15 and 2020-03-11, and after 2020-03-11.
china_after_feb15 <- confirmed_cases_china_vs_world %>%
filter(is_china == "China" & date >= "2020-02-15")
china_feb15_mar11 <- china_after_feb15 %>%
filter(date <= "2020-03-11")
china_after_mar11 <- china_after_feb15 %>%
filter(date >= "2020-03-11")
ggplot(china_after_feb15, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth(data = china_feb15_mar11,method = "lm", se = FALSE, size = 0.7, aes(color = "blue")) +
geom_smooth(data = china_after_mar11,method = "lm", se = FALSE, size = 0.7, aes(color = "red")) +
my_theme +
scale_color_identity(name = "Trend Line",
labels = c("Between Feb15 and Mar11", "After Mar11"),
guide = "legend")

How about the rest of the world?
The following graph shows how the two trend lines of non-China countires differ before and after 11 March, 2020.
not_china <- confirmed_cases_china_vs_world %>%
filter(is_china != "China")
not_china_feb15_mar11 <- not_china %>%
filter(date >= "2020-02-15" & date <= "2020-03-11")
not_china_after_mar11 <- not_china %>%
filter(date >= "2020-03-11")
ggplot(data = not_china, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth(data = not_china_feb15_mar11, method = "lm", se=FALSE, size = 0.7, aes(color = "blue")) +
my_theme +
geom_smooth(data = not_china_after_mar11, method = "lm", se = FALSE, size = 0.7, aes(color = "red")) +
scale_color_identity(name = "Trend Line",
labels = c("Between Feb15 and Mar11", "After Mar11"),
guide = "legend")

When we plot worldwide confirmed cases, the graph seems to suggest that the increasing behavior is very close to linear. In fact, the following graph, especially on non-China countires, shows that the spread is actually increasing at an exponential rate.
plt_not_china_trend_line_all <- ggplot(data = not_china, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth( method = "lm", se=FALSE, size = 0.7) +
my_theme +
scale_y_log10(labels = comma_format())
plt_not_china_trend_line_all

Top 10 hardest hit countries
The table below shows the top 10 countires that are affected the most due to COVID-19.
# extract data frame of interest
confirmed_cases_by_country <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(date, country, province) %>%
summarize(cases = sum(cases)) %>%
select(country, province, date, cases) %>%
group_by(country) %>%
mutate(cum_cases = cumsum(cases))
# extract top 10 countries with hardest hit
confirmed_cases_top10 <- confirmed_cases_by_country %>%
group_by(country) %>%
summarize(total_cases = max(cum_cases)) %>%
top_n(10) %>%
arrange(desc(total_cases))
# view them in a table
confirmed_cases_top10
## # A tibble: 10 x 2
## country total_cases
## <chr> <int>
## 1 US 3245925
## 2 Brazil 1839850
## 3 India 849522
## 4 Russia 719449
## 5 Peru 322710
## 6 Chile 312029
## 7 Mexico 295268
## 8 United Kingdom 290504
## 9 South Africa 264184
## 10 Iran 255117
Plotting Top 5 hardest hit countries
The following gives an visualization of top 5 countries that are affected the most and their trajectories.
# confirmed cases top 5
confirmed_cases_top5_country <- confirmed_cases_by_country %>%
filter(country %in% confirmed_cases_top10$country[1:5]) %>%
select(country, date, cum_cases)
ggplot(data = confirmed_cases_top5_country, aes(x = date, y = cum_cases, color = country, group = country)) +
geom_line() +
my_theme

Death Cases
Top 25 Death counts by country
The following graph gives the top 25 countries that have the most number of deaths due to COVID-19 and their death counts.
top_25_death_count <- coronavirus %>%
filter(type == "death") %>%
group_by(country) %>%
summarize(total_death = sum(cases)) %>%
arrange(desc(total_death)) %>%
top_n(25)
top_25_death_count$country = factor(top_25_death_count$country, levels = top_25_death_count$country)
top_25_death_count$angle = 1:25 * 360/25
plt_top_25_death_count <- ggplot(top_25_death_count, aes(country, total_death, fill = total_death)) +
geom_col(width = 1, color = 'grey90') +
geom_col(aes(y=I(5)), width=1, fill='grey90', alpha = 0.2) +
geom_col(aes(y=I(3)), width=1, fill='grey90', alpha = 0.2) +
geom_col(aes(y=I(2)), width=1, fill = "white") +
scale_y_log10() +
scale_fill_gradientn(colors = c("darkgreen","green","orange","firebrick","red"),trans ="log") +
geom_text(aes(label = paste(country, total_death, sep= "\n"),
y = total_death * 0.5, angle= angle),
data = function(top_25_death_count) top_25_death_count[top_25_death_count$total_death > 8000,],
size = 2.5, color = "white", fontface = "bold", vjust =1) +
geom_text(aes(label = paste0(total_death, " death ", country),
y =max(total_death)*1.5, angle = angle+90),
data = function(top_25_death_count) top_25_death_count[top_25_death_count$total_death < 8000,],
size = 2.5, vjust = 0.5) +
coord_polar(direction=-1) +
theme_void() +
theme(legend.position="none")
plt_top_25_death_count

Overall changes over time
The United States ranks the top 1 country with the most death counts. The following chart shows how death counts change from 30 March, 2020 to the current date for the top 25 countries.
death_count<- coronavirus %>%
filter(type == "death") %>%
group_by(country,date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_death = cumsum(cases))
death_count_data <- death_count %>%
filter(country %in% top_25_death_count$country)
df.death_count_data <- death_count_data %>%
filter(date >= "2020-03-30") %>%
group_by(date) %>%
mutate(ordering = rank(total_death)) %>%
ungroup()
plt_death_count_data <-ggplot(df.death_count_data, aes(ordering,group = country, fill = country)) +
geom_tile(aes(y = total_death/2,
height = total_death,
width = 0.8), alpha = 0.9, size =0.8) +
geom_text(aes(y = total_death, label = paste(total_death)), hjust =-.4,size = 3)+
geom_text(aes( y = 0, label = paste(country)), hjust = 1, size = 3) +
coord_flip(clip = "off", expand = FALSE) +
scale_color_viridis_d(name = "")+
scale_fill_viridis_d(name="")+
scale_y_continuous() +
theme_tufte(10, "Avenir") +
guides(color=F, fill=F) +
theme(plot.title = element_text(hjust = 0, size = 15),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.title.y = element_blank(),
axis.ticks = element_blank(),
plot.margin = margin(2,2,2,4,"cm")) +
labs(title = "Date: {frame_time}", subtitle = "Top 25 Countries", y = "Death count") +
transition_time(date) +
ease_aes('cubic-in-out')
animate(plt_death_count_data, fps=10, duration = 15, end_pause = 30, rewind = FALSE)

Recovered Cases
the Percent of daily recovered cases
Here we are interested in how the percentage of daily recovered cases changes over time, we calculate this percentage as:
\[
\begin{aligned}
recover\;percent_{i}=\frac{total\; recoverd\;cases_{i}}{total_\;confirmed\;cases_{i}}\:\:\:i\in dataset\;date\;range \:\:for\;each\;country
\end{aligned}
\]
The following chart shows how the percentage changes over time for the top 5 countries that were hitted the hardest. Notice the start tracking dates are different for each country as they are in respective to their dates where the first confirmed cases were found.
#cumulative confirmed cases by date for each country
confirmed_country_date <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country, date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_confirmed = cumsum(cases)) %>%
select(country, date, total_confirmed)
confirmed_country_date_top5 <- confirmed_country_date %>%
filter(country %in% confirmed_cases_top10$country[1:5])
#cumulative recovered cases by date for each country
recovered_country_date <- coronavirus %>%
filter(type == "recovered") %>%
group_by(country, date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_recovered = cumsum(cases)) %>%
select(country, date, total_recovered)
recovered_country_date_top5 <- recovered_country_date %>%
filter(country %in% confirmed_cases_top10$country[1:5])
#final data frame of interest
recover_percent_tbl <- inner_join(confirmed_country_date_top5, recovered_country_date_top5, by = c("date", "country")) %>%
mutate(recover_percent = total_recovered/total_confirmed) %>%
filter(total_confirmed !=0)
#plot
plt_recover_percent <- ggplot(recover_percent_tbl, aes(x = date, y = recover_percent, group = country, color = country)) +
geom_line() +
scale_y_continuous(labels=scales::percent) +
scale_color_viridis_d() +
theme_classic() +
labs(x = "Dates", y = "Daily Recovered Percentage") +
theme(legend.position = "top") +
geom_point() +
transition_reveal(date)
animate(plt_recover_percent, rewind = FALSE, end_pause= 30)

Case Counts and Percantages
#cumulative confirmed cases by date for each country
confirmed_country <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_confirmed= cases) %>%
select(country, ttl_confirmed)
recovered_country <- coronavirus %>%
filter(type == "recovered") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_recovered= cases) %>%
select(country, ttl_recovered)
death_country <- coronavirus %>%
filter(type == "death") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_death= cases) %>%
select(country, ttl_death)
final_tbl <- confirmed_country %>%
inner_join(recovered_country, by = "country") %>%
mutate(recovered_pct = as.numeric(format(round(ttl_recovered/ttl_confirmed,4), nsmall = 4))) %>%
inner_join(death_country, by = "country") %>%
mutate(death_pct = as.numeric(format(round(ttl_death/ttl_confirmed,4), nsmall = 4))) %>%
filter(ttl_confirmed !=0) %>%
select(country, ttl_confirmed, ttl_recovered, ttl_death, recovered_pct, death_pct) %>%
arrange(desc(ttl_confirmed))
Chart for summary table
The following interactive table shows the percentage of the recovered cases and the percentage of death cases for each country.
bar_chart <- function(label, value, height = "20px", fill = "#00bfc4", background = NULL){
width <- paste0(value * 100, "%")
bar <- div(style = list(background = fill, width = width, height = height))
chart <- div(style = list(flexGrow = 1, align= "right",background = background), bar)
div(style=list(display = "flex", align = "right"),chart, label)
}
reactable::reactable(final_tbl, minRows = 5, searchable = TRUE,
columns = list(
ttl_confirmed = colDef(align = "center",format = colFormat(separators = TRUE)),
ttl_recovered = colDef(align = "center",format = colFormat(separators = TRUE)),
ttl_death = colDef(align = "center",format = colFormat(separators = TRUE)),
recovered_pct = colDef(align = "center", cell = function(value){
label <- paste0(round(value*100,digits = 2),"% ")
bar_chart(label, value, background = "#e1e1e1")
},
),
death_pct = colDef(align="center", cell = function(value){
label <- paste0(round(value*100, digits = 2),"% ")
bar_chart(label, value, fill = "#fc5185", background = "#e1e1e1")
}
)
)
)