#Removing previous datasets in memory
rm(list = ls())
#Loading the relevant libraries
library("ggplot2")
library("gridExtra")
library("dplyr")
library("purrr")
#For Maps
library("sf")
library("rnaturalearth")
library("rnaturalearthdata")
library("ggrepel")
library("modelsummary")
library("broom")
#For Animation
library("gifski")
library("av")
library("magick")
library("gganimate")
1 Loading the Data
Let us go back to the old Life Expectancy and urbanization datasets. If you don’t have them anymore, you can download them from the following links:
#Setting path
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week9/lab/")
#Step1: Loading the data
<- read.csv(file = './data/life-expectancy.csv')
life_expectancy_df <- read.csv(file = './data/share-of-population-urban.csv') urbanization_df
2 Cleaning the Data
In the next few lines we average life expectancy over country (the original dataset is a panel - with countries and years). This means that we are getting rid of the time component.
#Step1: Selecting after 1900
<-subset(life_expectancy_df, Year>1900)
life_expectancy_df#Step2: Calculating the mean
<-life_expectancy_df%>%
life_expectancy_df2::group_by(Entity, Code)%>%
dplyr::summarize(life_exp_mean=mean(Life.expectancy.at.birth..historical.))
dplyr#Step3: Cleaning the Data
<- c("OWID_KOS", "OWID_WRL", "")
weird_labels <-subset(life_expectancy_df2, !(Code %in% weird_labels))
clean_life_expectancy_dfhead(clean_life_expectancy_df, n=5)
#Step1: Calculating the mean
<-urbanization_df%>%
urbanization_df2::group_by(Code)%>%
dplyr::summarize(urb_mean=mean(Urban.population....of.total.population.))
dplyr#Step2: Cleaning the Data
<- c("OWID_KOS", "OWID_WRL", "")
weird_labels <-subset(urbanization_df2, !(Code %in% weird_labels))
clean_urbanization_dfhead(clean_urbanization_df, n=5)
We are now left merge urbanization on life expectancy
#Step1: Performing the left_merge
<-left_join(clean_life_expectancy_df, clean_urbanization_df, by = c("Code"="Code"))
merged_data#Step2: Getting rid of the NAs
<-subset(na.omit(merged_data))
merged_data2#Step3: Sorting by Entity name
<- merged_data2[order(merged_data2$Entity),]
merged_data2 glimpse(merged_data2)
Rows: 214
Columns: 4
Groups: Entity [214]
$ Entity <chr> "Afghanistan", "Albania", "Algeria", "American Samoa", "…
$ Code <chr> "AFG", "ALB", "DZA", "ASM", "AND", "AGO", "ATG", "ARG", …
$ life_exp_mean <dbl> 45.38333, 68.28611, 57.53013, 68.63750, 77.04861, 45.084…
$ urb_mean <dbl> 18.61175, 40.44416, 52.60921, 79.76249, 87.04302, 37.539…
Let us now calculate the correlation between life expectancy and urbanization using the two ways we learned.
3 Correlation between urbanization and life expectancy
As we discussed in the previous lab, we can calculate correlations using the cor
command. This is given by the following formula:
\[\rho = r = \frac{\frac{\Sigma(X_{i} - \bar{X}) (Y_i - \bar{Y})}{n}}{s_X * s_Y}\]
<-cor(merged_data2$urb_mean, merged_data2$life_exp_mean, method = c("pearson"))
cor_coeff cor_coeff
[1] 0.6906849
We can use the correlation coefficient to calculate the b, following the formula:
\[b = r \frac{s_Y}{s_X}\]
<-cor_coeff*(sd(merged_data2$life_exp_mean)/sd(merged_data2$urb_mean))
b_res b_res
[1] 0.2515915
We can also obtain the same result by running a regression
<-lm(life_exp_mean~urb_mean, data=merged_data2)
modelsummary(model)
Call:
lm(formula = life_exp_mean ~ urb_mean, data = merged_data2)
Residuals:
Min 1Q Median 3Q Max
-14.371 -3.929 -0.529 4.562 18.623
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 48.96112 1.02706 47.67 <2e-16 ***
urb_mean 0.25159 0.01809 13.91 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.398 on 212 degrees of freedom
Multiple R-squared: 0.477, Adjusted R-squared: 0.4746
F-statistic: 193.4 on 1 and 212 DF, p-value: < 2.2e-16
The way we interpret b is the following: every unit increase in urbanization is associated with an increase in life expectancy of 0.25. This effect is highly statistically significant.
4 Creating a regression table
This is how we create a professionally-looking table.
<-lm(life_exp_mean~urb_mean, data=merged_data2)
model
<-list("Life Expectancy" = model)
models
<- c('urb_mean'='Urbanization',
cm "(Intercept)"="Intercept")
modelsummary(models, stars = TRUE, coef_map = cm, gof_map = c("nobs", "r.squared"))
Life Expectancy | |
---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
Urbanization | 0.252*** |
(0.018) | |
Intercept | 48.961*** |
(1.027) | |
Num.Obs. | 214 |
R2 | 0.477 |
5 Creating a coefficient plot
We will now use a new function - tidy
, which will allow us to save the regression coefficients in a dataframe.
<- tidy(model)
results results
We can plot these coefficients in the following way:
#To calculate the confidence interval, we use 1.96,
#meaning that 95% of the area under a normal curve lies
#within approximately 1.96 standard deviations from the mean
ggplot(results,
aes(x = estimate, y = term)) +
geom_point()+
geom_errorbar(aes(xmin = estimate-1.96*std.error,
xmax = estimate+1.96*std.error),
linewidth = 1, width=0)
There is only one issue: the value of the intercept is way too high compared to the value of urbanization. This is why we only see a dot for ubanization and a longer line for the intercept. To fix this problem, we will scale our variables. This means that we substract the mean and divide it by the standard deviation.
<-lm(scale(life_exp_mean)~scale(urb_mean), data=merged_data2)
model_scaledsummary(model_scaled)
Call:
lm(formula = scale(life_exp_mean) ~ scale(urb_mean), data = merged_data2)
Residuals:
Min 1Q Median 3Q Max
-1.62816 -0.44519 -0.05994 0.51680 2.10984
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.770e-15 4.955e-02 0.00 1
scale(urb_mean) 6.907e-01 4.967e-02 13.91 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7249 on 212 degrees of freedom
Multiple R-squared: 0.477, Adjusted R-squared: 0.4746
F-statistic: 193.4 on 1 and 212 DF, p-value: < 2.2e-16
Let us now compare the two models.
<-lm(life_exp_mean~urb_mean, data=merged_data2)
model1<-lm(scale(life_exp_mean)~scale(urb_mean), data=merged_data2)
model2<-list("Unscaled" = model1,
models"Scaled" = model2)
<- c('urb_mean'='Urbanization',
cm 'scale(urb_mean)' = 'Urbanization',
"(Intercept)"="Intercept")
modelsummary(models, stars = TRUE, coef_map = cm, gof_map = c("nobs", "r.squared"))
Unscaled | Scaled | |
---|---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
Urbanization | 0.252*** | 0.691*** |
(0.018) | (0.050) | |
Intercept | 48.961*** | 0.000 |
(1.027) | (0.050) | |
Num.Obs. | 214 | 214 |
R2 | 0.477 | 0.477 |
Let us now try to redraw the coefficients.
#Step1: Save the lm results to a dataframe
<-lm(scale(life_exp_mean)~scale(urb_mean), data=merged_data2)
model2<-tidy(model2) results
#Step2: Plot the results
ggplot(results, aes(x = estimate, y = term)) +
geom_point()+
geom_errorbar(aes(xmin = estimate-1.96*std.error,
xmax = estimate+1.96*std.error),
linewidth = 1, width=0)
6 Calculating predicted values and residuals
To calculate the predicted values and residuals, we will simply work with our dataframe.
head(merged_data2, n=10)
We will first add the \(\beta_0\) (a) and \(\beta_1\) (b) to our dataframe.
$b<-model1$coefficients["urb_mean"]
merged_data2$a<-model1$coefficients[1]
merged_data2head(merged_data2, n=10)
Let us now compute the predicted values for life expectancy.
$y_hat<-merged_data2$b*merged_data2$urb_mean + merged_data2$a
merged_data2head(merged_data2, n=10)
Let us now plot all the y_hats and ys and let us make the y_hats red triangles. In order to do that, we need to split the sample into two and add them on top of each other. We will have the same x. Y will be Y-hat (predicted Y) and Y (observed Y).
<-subset(merged_data2, select = c(Code, life_exp_mean, urb_mean))
sample1$type<-"Observed"
sample1head(sample1, n=5)
<-subset(merged_data2, select = c(Code, y_hat, urb_mean))
sample2$type<-"Predicted"
sample2names(sample2)[names(sample2)=="y_hat"]<-"life_exp_mean"
head(sample2, n=5)
Now we bind the two dataframes by rows. This means that the two dataframes are added (stacked) on top of each other.
<-rbind(sample1, sample2)
sample_both# sort by Code
<- sample_both[order(sample_both$Code),]
sample_both head(sample_both)
Now, we are ready to make the graph.
<-ggplot(sample_both, aes(x=urb_mean, y=life_exp_mean,
figure1color = type, shape=type)) +
scale_color_manual(values=c('black', "red"))+
scale_shape_manual(values=c(16, 17))+
geom_point(size = 3) +
geom_smooth(method='lm', se=FALSE)+
theme_bw()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(30,100))+
theme(axis.text.x = element_text(size=14),
axis.text.y = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
figure1
We can now calculate the residuals: residuals are the difference between y_hat and y.
$y_resid<-merged_data2$life_exp_mean-merged_data2$y_hat merged_data2
What are the countries that deviate the most from our prediction? Let’s say that we want to emphasize in a scatter plot the 4th quartile countries that deviate the most from our prediction. To do that we go back to our merged dataset. Note that the residuals can be both positive and negative, which is why we take the absolute value.
summary(abs(merged_data2$y_resid))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.001473 2.186840 4.077296 5.115602 7.644769 18.622716
We can now create a new variables in the dataset that captures whether countries are in that upper quartile deviation.
#Creating a variable that has "normal" as values
$deviation<-"normal"
merged_data2#Create threshold for deviation
<-summary(abs(merged_data2$y_resid))[5]
threshold#Changing the "normal" to "high" if the residual is higher than a specific value
$deviation[abs(merged_data2$y_resid)>threshold]<-"high"
merged_data2#Creating a new variable - "name_deviation" to indicate the name of the countries which deviate
$name_deviation[abs(merged_data2$y_resid)>threshold]<-merged_data2$Entity
merged_data2#Examinging the first 7 entries
head(merged_data2, n=7)
We can now plot to see those deviations
ggplot(merged_data2, aes(x=urb_mean, y=life_exp_mean,
color = deviation, shape=deviation)) +
scale_color_manual(values=c('red', "black"))+
scale_shape_manual(values=c(18, 16))+
geom_point(size = 3) +
#Plotting the regression line
geom_segment(aes(x = 0, y = model1$coefficients[1],
xend = 100,
yend = model1$coefficients[["urb_mean"]] * max(merged_data2$urb_mean) +model1$coefficients[1]), color = "blue")+
#Adding the name of the country deviations
geom_text(aes(label = name_deviation),
size = 2,
check_overlap = TRUE, position = position_nudge(y = 3))+
theme_bw()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(30,100))+
theme(axis.text.x = element_text(size=14),
axis.text.y = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
7 Urbanization and life expectancy over time
Up until now, we took the average for both urbanization and life expectancy and we worked very little with the values of these two variables over time.
Let us examine the relationship between life expectancy and urbanization in 1970, 1990, 2000, and 2020
We first go back to our unaveraged datasets. We first clean them.
#Removing the countries that have no CODE
<-subset(life_expectancy_df, life_expectancy_df$Code!="")
life_expectancy_df2<-subset(urbanization_df, urbanization_df$Code!="")
urbanization_df2#Renaming variable
names(life_expectancy_df2)[4]<-"life_exp_yearly"
names(urbanization_df2)[4]<-"urb_yearly"
#Removing "Entity" from urbanization_df2
<-subset(urbanization_df2, select =-c(Entity)) urbanization_df2
We now join them based on Code and Year.
<-left_join(life_expectancy_df2, urbanization_df2, by = c("Code"="Code", "Year"="Year")) merged_df_year
Let us now select the relevant years and plot them.
<-subset(merged_df_year, Year==1970)
merged_1970<-subset(merged_df_year, Year==1990)
merged_1990<-subset(merged_df_year, Year==2000)
merged_2000<-subset(merged_df_year, Year==2020) merged_2020
ggplot(merged_1970, aes(x=urb_yearly, y=life_exp_yearly)) +
geom_point()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(0,100))+
geom_smooth(method = "lm", se = FALSE)+
ggtitle("1970")+
theme_bw()
ggplot(merged_1990, aes(x=urb_yearly, y=life_exp_yearly)) +
geom_point()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(0,100))+
geom_smooth(method = "lm", se = FALSE)+
ggtitle("1990")+
theme_bw()
ggplot(merged_2000, aes(x=urb_yearly, y=life_exp_yearly)) +
geom_point()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(0,100))+
geom_smooth(method = "lm", se = FALSE)+
ggtitle("2000")+
theme_bw()
ggplot(merged_2020, aes(x=urb_yearly, y=life_exp_yearly)) +
geom_point()+
scale_x_continuous(name = "urbanization", breaks=seq(0,100,20),
limits = c(0,100))+
scale_y_continuous(name = "life_expectancy", breaks=seq(0,100,20),
limits = c(0,100))+
geom_smooth(method = "lm", se = FALSE)+
ggtitle("2020")+
theme_bw()
What can we say about the relationship between urbanization and life expectancy over time? It seems that it flattens out. But what if we created more graphs that shows this over time. We can create a gif that allows us to see the evolution of the relationship between urbanization and life expectancy over time.
library(gganimate)
library(gifski)
library(av)
library(magick)
<-subset(merged_df_year, Year>1950)
merged_year_1950on# Create the base plot
<- ggplot(merged_year_1950on, mapping = aes(x = urb_yearly, y = life_exp_yearly)) +
figure_animated geom_point() +
theme_bw() +
geom_smooth(method = "lm", se = FALSE) +
scale_x_continuous(breaks = seq(0, 100, by = 20), limits = c(0, 100)) +
scale_y_continuous(breaks = seq(0, 100, by = 20), limits = c(0, 100)) +
labs(title = 'Year: {frame_time}', x = 'urbanization', y = 'life expectancy')+
transition_states(Year) +
transition_time(as.integer(Year))
figure_animated
The animation really allows us to see that countries become more urbanized over time and as a result, the relationship between urbanization and life expectancy flattens out.