library("broom")
library("dplyr")
library("ggplot2")
library("ggrepel")
library("modelsummary")
library("ggdag")
library("purrr")
library("stringr")
library("ggpubr")
#For Maps
library("sf")
library("stars")
library("rnaturalearth")
library("rnaturalearthdata")
library("ggspatial")
library("gganimate")
library("lemon")
#Removing previous files and setting the directory
rm(list = ls())
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week12/lab/")
1 Introduction
In this hypothetical situation (the data is made up), the World Bank is planning on launching a program designed to reduce the risk of malaria in Malawi by providing insecticide-treated bed nets. So they provided insecticide-treated bed nets only to city B from 2017 to 2020. The World Bank selected 24 individuals (over 3 years) from city B and they want to investigate whether receiving such nets has any effect on people’s risk of malaria.
In this document, we want to calculate \(\beta_3\) from:
\[ Y_{it} = \beta_0 + \beta_1 \text{Group}_i + \beta_2 \text{Time}_t +\beta_3 (\text{Group}_i \times \text{Time}_t) + \epsilon_{it} \]
where:
Group = 1 if this is the treatment group
Time = 1 if this is the period after intervention
\(\beta_0\) - mean of the control group in the pre-treatment period
\(\beta_1\) - the change in outcome across groups
\(\beta_2\) - the change in outcome within groups.
\(\beta_3\) - the differences in differences
In our case, this is equivalent to:
\[ \text{Malaria Risk}_{it} = \beta_0 + \beta_1 \text{City B}_i + \beta_2 \text{Post-2017}_t +\beta_3 (\text{City B}_i \times \text{Post-2017}_t) + \epsilon_{it} \]
Preliminaries: Create a folder within the relevant week called “lab”. Within it, create another folder called “data” and another one, called “graphs”.
In a first instance, we want to draw a causal model which explains the effect of the insecticide-treated bed nets on participant risk of malaria, given the time and group. See the DAG models below (together with the code to produce them).
Step 1 Code
<- dagify(malaria_risk ~ net + year + city,
malaria_dag ~ year + city,
net exposure = "net",
outcome = "malaria_risk",
labels = c(malaria_risk = "Malaria Risk",
net = "Mosquito Net",
year = "Year",
city = "City",
pre_malaria_risk = "Pre Malaria Risk"),
coords = list(x = c(malaria_risk = 3, net = 1, year = 2, city = 2),
y = c(malaria_risk = 2, net = 2, year = 3, city = 1)))
<-data.frame(tidy_dagitty(malaria_dag))
bigger_dag $type[bigger_dag$name=="malaria_risk"]<-"Outcome"
bigger_dag$type[bigger_dag$name=="net"]<-"Intervention"
bigger_dag<-min(bigger_dag$x)
min_lon_x<-max(bigger_dag$x)
max_lon_x<-min(bigger_dag$y)
min_lat_y<-max(bigger_dag$y) max_lat_y
Step 2 Code
= c("Outcome"="green3",
col "Intervention"="red",
"Confounder"="grey60")
<-c("Outcome", "Intervention", "Confounder")
order_col
<-ggplot(data = bigger_dag, aes(x = x, y = y, xend = xend, yend = yend, color=type)) +
example_daggeom_dag_point() +
geom_dag_edges() +
coord_sf(xlim = c(min_lon_x-1, max_lon_x+1), ylim = c(min_lat_y-1, max_lat_y+1))+
scale_colour_manual(values = col,
name = "Group",
breaks = order_col)+
geom_label_repel(data = subset(bigger_dag, !duplicated(bigger_dag$label)),
aes(label = label), fill = alpha(c("white"),0.8))+
theme_bw()+
labs(x = "", y="")
#We now save the graph in a folder called "graphs"
ggsave(example_dag, file = "./graphs/example_dag1.jpg",
height = 20, width = 25,
units = "cm", dpi = 200)
example_dag
2 Loading the Data
Go to the following links and download the following:
Place all the data in the data folder.
2.1 Loading the Shape Files
st_read
reads the shapefiles from your hardrive. read_stars
reads a tif file that shows the elevation for your area of interest. ne_countries
loads a shapefile with all the countries around the world.
#Country shapefile
<- st_read("./data/malawi-latest-free/gadm41_MWI_0.shp")
mwi #Region 1 shapefile
<- st_read("./data/malawi-latest-free/gadm41_MWI_1.shp")
mwi1 #Region 3 shapefile
<- st_read("./data/malawi-latest-free/gadm41_MWI_3.shp")
mwi3 #Selected communes shapefile
<- st_read("./data/malawi-latest-free/selected_communes3.shp")
select_comm #Raster with elevation
<- read_stars("./data/elev.tif")
elev #World Shapefile
<- ne_countries(scale = "medium", returnclass = "sf")
world #Switched off spherical geometry (s2)
::sf_use_s2(FALSE) sf
2.2 Simplifying the shapefiles for better rendering
st_simplify
simplifies the polygons for quicker rendering.
#Simplifying polygons for quicker rendering
#Note: you have to play around with dTolerance to figure out the appropriate value
#Too little - your computer will need more time to render the file
#Too much - the country polygons will be too simple
<-st_simplify(mwi, dTolerance = 0.005)
mwi<-st_simplify(mwi1, dTolerance = 0.005)
mwi1<-st_simplify(mwi3, dTolerance = 0.005)
mwi3<-st_simplify(select_comm, dTolerance = 0.0005)
select_comm<-st_simplify(world, dTolerance = 0.005) world
2.3 Calculating the appropriate zoom level for Malawi
We now calculate the zoom level for the entire country of Malawi.
#Calculating the appropriate zoom level
<- st_bbox(mwi)
mwi_extent
<-mwi_extent["xmin"]
min_lon_x<-mwi_extent["xmax"]
max_lon_x
<-mwi_extent["ymin"]
min_lat_y<-mwi_extent["ymax"] max_lat_y
We now create a new polygon to emphasize where Malawi is on the World Map.
#Creating a dataframe to emphasize Malawi and adding some error
= 5
error
= data.frame("lon" = c(min_lon_x - error, max_lon_x + error,
sp_df - error, max_lon_x + error),
min_lon_x "lat" = c(min_lat_y - error, min_lat_y - error,
+ error, max_lat_y + error)) max_lat_y
We now make a spatial polygon out of it so that we can highlight where Malawi is on the world map.
#Creating polygon to indicate zoom
<- sp_df %>%
poly st_as_sf(coords = c("lon", "lat"),
crs = st_crs(mwi)) %>%
st_bbox() %>%
st_as_sfc()
#Examining what we just created
ggplot() +
geom_sf(data = mwi, fill="red", color = "red", linewidth = 0.9)+
geom_sf(data = poly, fill=NA, color = "red", linewidth = 0.3)
2.4 Cropping the raster
We now crop our raster only to the area of interest. Again, if we have too much, our computer will have a hard time rendering the file.
#Step1: Reducing raster resolution: Higher dx/day values mean lower resolution
<-st_warp(elev, st_as_stars(st_bbox(elev), dx = 0.05, dy = 0.05))
elev2#Step2: Removing really high pixel values for good contrast in MWI
> 2000] <- NA
elev2[elev2 #Step3: Cropping
<-st_crop(elev2, poly) dem_mwi
<-ggplot() +
map_wordgeom_sf(data = world, fill=NA, color = "black", linewidth = 0.3)+
geom_sf(data = mwi, fill="red", color = "red", linewidth = 0.7)+
geom_sf(data = poly, fill=NA, color = "red", linewidth = 0.9)+
theme_bw()+
labs(x = "Longitude", y="Latitude")+
ggtitle("Malawi on the World Map")+
theme(axis.text.x = element_text(size=14),
axis.text.y = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
#ggspatial::annotation_scale(location = 'tr')
map_word
ggsave(map_word, file="./graphs/map_world.jpg", height=15, width=30, units = "cm", dpi=300)
2.5 Mapping the region of interest - Malawi
#Selecting only the region of interest
<-subset(mwi1, NAME_1=="Nkhata Bay")
mwi1_lab#Defining colors for raster
<- colorRampPalette(c("grey90", "grey30"))
grey_palette # Generate a sequence of colors based on the number of breaks desired
<- 10 # Adjust this based on your data
num_breaks <- grey_palette(num_breaks) grey_colors
This is the region of Nkhata Bay where the World Bank conducts the study. Within the region of Nkhata Bay, the World Bank chose city B to provide nets and examined its effects from 2017 onwards. We now want to draw a new map to emphasize where Nkhata Bay is.
<-0.2
error<-ggplot() +
map_malawigeom_stars(data=elev2, show.legend=FALSE) +
scale_fill_gradientn(colours = grey_colors)+
geom_sf(data = select_comm, fill="green", color = "green", linewidth = 0.4)+
geom_sf(data = mwi1, fill=NA, color = "blue", linewidth = 0.2)+
geom_sf(data = mwi1_lab, fill=NA, color = "blue", linewidth = 0.9)+
geom_sf_text(data=mwi1_lab, aes(label = NAME_1), size=3.5)+
theme_bw()+
coord_sf(xlim = c(min_lon_x-5*error, max_lon_x+5*error),
ylim = c(min_lat_y-error, max_lat_y+error))+
labs(x = "Longitude", y="Latitude")+
ggtitle("Nkhata Bay in Malawi")+
theme(axis.text.x = element_text(size=14),
axis.text.y = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#legend.position.inside values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))+
::annotation_scale(location = 'tr')
ggspatial#Saving the map
ggsave(map_malawi, file="./graphs/map_malawi.jpg", height=15, width=15, units = "cm", dpi=300)
map_malawi
2.6 Calculating the appropriate zoom level for the Selected Communes
We now calculate the zoom level for the region of interest - Nkhata Bay.
#Calculating the appropriate zoom level for the selected commune
<- st_bbox(select_comm)
mwi_extent
<-mwi_extent["xmin"]
min_lon_x<-mwi_extent["xmax"]
max_lon_x
<-mwi_extent["ymin"]
min_lat_y<-mwi_extent["ymax"] max_lat_y
#Re-reading the shapefiles.
#We now want more accurate polygons since we are zooming in.
#This is why we load them again
<- st_read("./data/malawi-latest-free/gadm41_MWI_0.shp")
mwi <- st_read("./data/malawi-latest-free/gadm41_MWI_1.shp")
mwi1 <- st_read("./data/malawi-latest-free/gadm41_MWI_3.shp")
mwi3 <- st_read("./data/malawi-latest-free/selected_communes3.shp")
select_comm <- read_stars("./data/elev.tif")
dem
#Step1: Reducing raster resolution: Higher dx/day values mean lower resolution
<-st_warp(dem, st_as_stars(st_bbox(elev), dx = 0.02, dy = 0.02))
elev2#Step2: Removing really high pixel values for good contrast in MWI
> 2000] <- NA
elev2[elev2
#Step3: Creating a dataframe to emphasize Malawi and adding some error
= 0.5
error = data.frame("lon" = c(min_lon_x - error, max_lon_x + error,
sp_df - error, max_lon_x + error),
min_lon_x "lat" = c(min_lat_y - error, min_lat_y - error,
+ error, max_lat_y + error))
max_lat_y
#Step4: Creating polygon to indicate zoom
<- sp_df %>%
poly st_as_sf(coords = c("lon", "lat"),
crs = st_crs(mwi)) %>%
st_bbox() %>%
st_as_sfc()
#Step5: Cropping
<-st_crop(elev2, poly)
dem_mwi
<- ne_countries(scale = "medium", returnclass = "sf")
world #Turning off spherical geometries
::sf_use_s2(FALSE) sf
#We also want to include roads in our map, to provide more context.
#Reading the OSM roads data
<-st_read("./data/malawi-latest-free/gis_osm_roads_free_1_crop2.shp")
roads#Step1: Dissolving polylines
<- st_combine(roads)
roads2 #Step2: Turning roads2 into an sf object
<-st_as_sf(roads2)
roads3#Step3: Only the roads of interest
<- sf::st_intersection(roads3, select_comm) roads_crop1
2.7 Mapping the Individuals
Let us first look at our data.
<- read.csv(file = './data/did_data_geo_malawi.csv')
diff_data glimpse(diff_data, n=10)
Rows: 8,000
Columns: 10
$ year <int> 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2013, 201…
$ income <dbl> 1284.551, 1285.832, 1285.997, 1285.157, 1286.313, 1287.24…
$ age <int> 33, 34, 35, 36, 37, 38, 39, 40, 51, 52, 53, 54, 55, 56, 5…
$ sex <chr> "Woman", "Woman", "Woman", "Woman", "Woman", "Woman", "Wo…
$ malaria_risk <dbl> 36.26529, 35.10382, 73.17664, 28.98390, 51.18721, 51.9071…
$ id <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, …
$ lat <dbl> -11.27476, -11.27476, -11.27476, -11.27476, -11.27476, -1…
$ lon <dbl> 34.03006, 34.03006, 34.03006, 34.03006, 34.03006, 34.0300…
$ after <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
$ city <chr> "City A", "City A", "City A", "City A", "City A", "City A…
<-0.05
error<-ggplot() +
figure_animatedgeom_sf(data = mwi1_lab, fill=NA, color = "blue", linewidth = 0.9)+
geom_point(data = diff_data, aes(x=lon, y=lat, color = city, size = malaria_risk, alpha=0.3))+
theme_bw()+
labs(x = "Longitude", y="Latitude")+
ggtitle("The Three Districts Selected in Nkhata Bay, Malawi\n Location of 1000 Individuals\n For the Experiment")+
theme(axis.text.x = element_text(size=14),
axis.text.y = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12),
legend.position="bottom")+
coord_sf(xlim = c(min_lon_x-3*error, max_lon_x+3*error), ylim = c(min_lat_y-error, max_lat_y+error))+
::annotation_scale(location = 'tr')+
ggspatiallabs(title = 'Year: {frame_time}', x = 'lon', y = 'lat')+
transition_states(year)+
transition_time(as.integer(year))
figure_animated
3 Data Analysis
We will now do the data analysis. Until now, we looked at various ways of mapping our data.
3.1 Summary statisics
Before we conduct any analysis, it is important to get a sense of our data, the minimum and maximum, the mean, and whether our variables are normally distributed or not. Let us now create a summary statistics table:
datasummary_skim(diff_data)
Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
---|---|---|---|---|---|---|---|---|
year | 8 | 0 | 2016.5 | 2.3 | 2013.0 | 2016.5 | 2020.0 | |
income | 8000 | 0 | 1249.9 | 160.6 | 836.3 | 1246.8 | 1710.7 | |
age | 87 | 0 | 32.3 | 16.0 | 1.0 | 31.0 | 87.0 | |
malaria_risk | 8000 | 0 | 48.9 | 12.7 | 0.0 | 49.0 | 100.0 | |
id | 1000 | 0 | 500.5 | 288.7 | 1.0 | 500.5 | 1000.0 | |
lat | 1000 | 0 | -11.2 | 0.1 | -11.3 | -11.2 | -11.1 | |
lon | 1000 | 0 | 34.1 | 0.1 | 34.0 | 34.1 | 34.2 | |
after | 2 | 0 | 0.4 | 0.5 | 0.0 | 0.0 | 1.0 | |
N | % | |||||||
sex | Man | 3224 | 40.3 | |||||
Woman | 4776 | 59.7 | ||||||
city | City A | 7808 | 97.6 | |||||
City B | 192 | 2.4 |
Income and malaria risk seem to be normally distributed. The net variable is binary, as expected. Age seems to be skewed to the right. This means thatmore values are concentrated on the left side of the distribution graph while the right tail of the distribution graph is longer.
3.2 Estimating the Difference
So, we are interested in the causal effect of the program - \(\beta_3 (\text{Group}_i \times \text{Time}_t)\). We can find this causal effect by calculating:
\[ Y_{it} = \beta_0 + \beta_1 \text{Group}_i + \beta_2 \text{Time}_t + \beta_3 (\text{Group}_i \times \text{Time}_t) + \epsilon_{it} \]
We can calculate \(\beta_3 (\text{Group}_i \times \text{Time}_t)\) by running:
<- lm(malaria_risk ~ city + after + city * after, data = diff_data)
model_did options(modelsummary_format_numeric_latex = "plain")
modelsummary(model_did, stars = TRUE, gof_map = c("nobs", "r.squared"))
(1) | |
---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
(Intercept) | 50.629*** |
(0.179) | |
cityCity B | 3.071** |
(1.155) | |
after | -4.532*** |
(0.292) | |
cityCity B × after | -7.623*** |
(1.886) | |
Num.Obs. | 8000 |
R2 | 0.034 |
Let us now also plot this difference. This is also called an “event-study” plot.
<- diff_data %>%
plot_data group_by(year, city) %>%
summarize(mean_risk = mean(malaria_risk),
se_risk = sd(malaria_risk) / sqrt(n()),
upper = mean_risk + (1.96 * se_risk),
lower = mean_risk + (-1.96 * se_risk))
Let us now investigate what we just produced.
head(plot_data)
We can now make an event-study plot. This is meant to illustrate the cumulative effect of the policy on the outcome. The effect of the policy must be measured with reference to some baseline. In our case this is city A.
<-ggplot(plot_data, aes(x = year, y = mean_risk, color = city)) +
mean_riskgeom_vline(xintercept = 2017.5) +
geom_errorbar(aes(ymin = lower, ymax = upper),
size = 1, width = 0,
position=position_dodge(width=0.04))+
geom_line() +
geom_point(size = 2, position=position_dodge(width=0.04))+
labs(x = "Year", y = "Malaria Risk")+
scale_y_continuous(breaks = (seq(40, 57, by = 3)),
limits = c(40, 57))+
scale_x_continuous(breaks = (seq(2013, 2020, by = 1)),
limits = c(2012, 2021))+
theme_bw() +
theme(legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank())
#Repositioning legend
<-reposition_legend(mean_risk, 'bottom left') mean_risk
#Saving the picture
ggsave(mean_risk, file = "./graphs/fig_dif_dif.jpg",
height = 12, width = 20,
units = "cm", dpi = 200)
We can see that parallel trends hold prior to the intervention. Regions in both cities A and B have a decreasing trend in the risk of malaria. After the intervention, it seems that people from city B have an even lower risk of malaria.
4 Conclusion
We can now conclude that indeed the provision of insecticide-treated nets reduces the risk of malaria by -7.62 units. Thus, if the we want to reduce such a health risk to the wider region, it might be effective to provide such nets to all vulnerable populations.