# Removing previous files and setting the directory
rm(list = ls())
setwd("/Users/bgpopescu/Dropbox/john_cabot/teaching/stats/week12/")
library("tidyverse") # For dplyr, ggplot, and friends
library("ggdag") # For plotting DAGs
library("dagitty") # For working with DAG logic
library("ggrepel")
library("modelsummary")
library("lemon")
library("MatchIt")
library("broom")
1 DAGs
DAGs stand for directed acyclic graphs. These are theoretical models that generated the data. This type of graph tells you what you should control for to isolate and identify a causal effect. The easiest way to quickly build DAGs and find adjustment sets and testable implications is to use dagitty.net. It is easiest if you draw your diagrams on dagitty.net and then in R using the ggdag
and dagitty
packages. You can go to the Launch DAGitty online in your browser section.
1.1 Loading relevant packages
The general process for making and working with DAGs in R is to create a DAG object with dagify()
and then plot it with ggdag()
. The syntax for dagify()
is similar to lm()
. For example, if we think that y is caused by a, b, and c, we would write y ~ a + b + c
. Similarly, if we think that a is caused by b and c, we would write a ~ b + c
. If y ~ a + b + c
and a ~ b + c
are true at the same time, then b and c are called confounders.
1.2 Creating a simple DAG
# Creating a basic DAG
<- dagify(
simple_dag ~ a + b + c,
y ~ b + c)
a
ggdag(simple_dag)
We can have more control over the layout of our graph by using the tidy_dagitty
function and transforming our dag object into a dataframe. We are setting the seed to obtain the same layout if we repeat the process.
set.seed(1234)
<-data.frame(tidy_dagitty(simple_dag))
bigger_dag
ggplot(data = bigger_dag, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges() +
geom_label_repel(data = subset(bigger_dag, !duplicated(bigger_dag$name)),
aes(label = name), fill = alpha(c("white"),0.8))
We can also distinguish among confounders, outcomes, and intervention
set.seed(1234)
# Creating a basic DAG
<- dagify(
simple_dag ~ a + b + c,
y ~ b + c,
a exposure = "a",
outcome = "y")
ggdag_status(simple_dag)
Of course, we can change the node names and labels:
<- dagify(
dag_with_var_names ~ a + b + c,
y ~ b + c,
a exposure = "a",
outcome = "y",
labels = c(y = "Outcome", a = "Treatment",
b = "Confounder 1", c = "Confounder 2"))
ggdag_status(dag_with_var_names,
use_labels = "label", text = FALSE) +
theme_dag()
You can have better control of the layout if you mannually alter the position of your dots. The easiest thing to do is to draw the diagram in daggity and the export it into R, as shown below.
<- dagitty('dag {
model_from_dagitty bb="-2.821,-1.677,2.821,1.677"
a [exposure,pos="0.235,-1.270"]
b [pos="-1.198,-0.470"]
c [pos="1.612,-0.448"]
y [outcome,pos="0.246,0.244"]
a -> y
b -> a
b -> y
c -> a
c -> y
}')
ggdag_status(model_from_dagitty) +
theme_dag()
Notice that each node has an X (horizontal) and Y (vertical) coordinates. You can play around with these to obtain a different layout.
1.3 A specific example
In previous sessions, we looked at the insecticide-treated net intervention. We argued that the risk of malaria is determined by five different factors: (1) whether people use mosquito nets, (2) their age, (3) their sex, (4) their income, and (5) their predisposition to malaria. At the same time, we also made the point that whether people use nets or not is determined by (1) their income and (2) their predisposition to malaria.
You can see below the DAG code that shows these relationships.
<- dagify(post_malaria_risk ~ net + age + sex + income + pre_malaria_risk,
malaria_dag ~ income + pre_malaria_risk,
net exposure = "net",
outcome = "post_malaria_risk",
#Labels
labels = c(post_malaria_risk = "Post Malaria Risk",
net = "Mosquito Net",
age = "Age",
sex = "Sex",
income = "Income",
pre_malaria_risk = "Pre Malaria Risk"),
#Manual coordinates
coords = list(x = c(net = 2, post_malaria_risk=7, income = 5, age = 2,
sex = 4, pre_malaria_risk=6),
y = c(net = 3, post_malaria_risk=2, income = 1, age = 2,
sex = 4, pre_malaria_risk=4)))
In order to make use of the ggplot
functionalities, I am turning the dagify object into a dataframe.
#Making a dataframe
<-data.frame(tidy_dagitty(malaria_dag))
bigger_dag $type<-"Confounder"
bigger_dag$type[bigger_dag$name=="post_malaria_risk"]<-"Outcome"
bigger_dag$type[bigger_dag$name=="net"]<-"Intervention"
bigger_dag
#Identifying the min and max coordinates
<-min(bigger_dag$x)
min_lon_x<-max(bigger_dag$x)
max_lon_x<-min(bigger_dag$y)
min_lat_y<-max(bigger_dag$y) max_lat_y
Finally, I am now creating the graph.
= c("Outcome"="green3",
col "Intervention"="red",
"Confounder"="grey60")
<-c("Outcome", "Intervention", "Confounder")
order_col
<-ggplot(data = bigger_dag, aes(x = x, y = y, xend = xend, yend = yend, color=type)) +
example_daggeom_dag_point() +
geom_dag_edges() +
coord_sf(xlim = c(min_lon_x-1, max_lon_x+1),
ylim = c(min_lat_y-1, max_lat_y+1))+
scale_colour_manual(values = col,
name = "Group",
breaks = order_col)+
geom_label_repel(data = subset(bigger_dag, !duplicated(bigger_dag$label)),
aes(label = label), fill = alpha(c("white"),0.8))+
theme_bw()+
labs(x = "", y="")
example_dag
2 Matching
In our malaria example, we did not have to worry about confounders because our treatment was randomized. Our dag looked like this:
Show the code
<- dagify(post_malaria_risk ~ net + age + sex + income + pre_malaria_risk,
malaria_dag2 exposure = "net",
outcome = "post_malaria_risk",
labels = c(post_malaria_risk = "Post Malaria Risk",
net = "Mosquito Net",
age = "Age",
sex = "Sex",
income = "Income",
pre_malaria_risk = "Pre Malaria Risk"),
coords = list(x = c(net = 2, post_malaria_risk=7, income = 5, age = 2,
sex = 4, pre_malaria_risk=6),
y = c(net = 3, post_malaria_risk=2, income = 1, age = 2,
sex = 4, pre_malaria_risk=4)))
Show the code
<-data.frame(tidy_dagitty(malaria_dag2))
bigger_dag $type<-"Confounder"
bigger_dag$type[bigger_dag$name=="post_malaria_risk"]<-"Outcome"
bigger_dag$type[bigger_dag$name=="net"]<-"Intervention"
bigger_dag<-min(bigger_dag$x)
min_lon_x<-max(bigger_dag$x)
max_lon_x<-min(bigger_dag$y)
min_lat_y<-max(bigger_dag$y) max_lat_y
Show the code
= c("Outcome"="green3",
col "Intervention"="red",
"Confounder"="grey60")
<-c("Outcome", "Intervention", "Confounder")
order_col
<-ggplot(data = bigger_dag, aes(x = x, y = y, xend = xend, yend = yend, color=type)) +
example_dag2geom_dag_point() +
geom_dag_edges() +
coord_sf(xlim = c(min_lon_x-1, max_lon_x+1), ylim = c(min_lat_y-1, max_lat_y+1))+
scale_colour_manual(values = col,
name = "Group",
breaks = order_col)+
geom_label_repel(data = subset(bigger_dag, !duplicated(bigger_dag$label)),
aes(label = label), fill = alpha(c("white"),0.8))+
theme_bw()+
labs(x = "", y="")
example_dag2
Let us now assume that we no longer deal with an RCT and that people get to choose whether they use an insecticide-treated net or not. Now, people choose whether they want the mosquito net or not. Our DAG looks like this:
Show the code
<- dagify(post_malaria_risk ~ net + age + sex + income + pre_malaria_risk,
malaria_dag ~ income + pre_malaria_risk,
net exposure = "net",
outcome = "post_malaria_risk",
labels = c(post_malaria_risk = "Post Malaria Risk",
net = "Mosquito Net",
age = "Age",
sex = "Sex",
income = "Income",
pre_malaria_risk = "Pre Malaria Risk"),
coords = list(x = c(net = 2, post_malaria_risk=7, income = 5, age = 2,
sex = 4, pre_malaria_risk=6),
y = c(net = 3, post_malaria_risk=2, income = 1, age = 2,
sex = 4, pre_malaria_risk=4)))
<-data.frame(tidy_dagitty(malaria_dag))
bigger_dag $type<-"Confounder"
bigger_dag$type[bigger_dag$name=="post_malaria_risk"]<-"Outcome"
bigger_dag$type[bigger_dag$name=="net"]<-"Intervention"
bigger_dag<-min(bigger_dag$x)
min_lon_x<-max(bigger_dag$x)
max_lon_x<-min(bigger_dag$y)
min_lat_y<-max(bigger_dag$y) max_lat_y
Show the code
= c("Outcome"="green3",
col "Intervention"="red",
"Confounder"="grey60")
<-c("Outcome", "Intervention", "Confounder")
order_col
<-ggplot(data = bigger_dag, aes(x = x, y = y, xend = xend, yend = yend, color=type)) +
example_daggeom_dag_point() +
geom_dag_edges() +
coord_sf(xlim = c(min_lon_x-1, max_lon_x+1), ylim = c(min_lat_y-1, max_lat_y+1))+
scale_colour_manual(values = col,
name = "Group",
breaks = order_col)+
geom_label_repel(data = subset(bigger_dag, !duplicated(bigger_dag$label)),
aes(label = label), fill = alpha(c("white"),0.8))+
theme_bw()+
labs(x = "", y="")
example_dag
We thus want to explain the risk of malaria, as a function of income and whether people use a mosquito net (treatment) or not (control). In a regression formula, we try to estimate:
\[ \text{Malaria Risk} = \beta_0 + \beta_1 \text{Income} + \beta_2 \text{Treatment} \]
We next run the model in r.
2.1 Loading the Data
Create a folder called “lab” within that week’s folder. Within it, create another folder and call it “data”. Go to the following links and download the following:
Place all the data in the data folder.
Let us first look at our data.
<- read.csv(file = './data/rct_data_geo_malawi.csv')
rct_data glimpse(rct_data, n=10)
Rows: 1,000
Columns: 10
$ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
$ lat <dbl> -11.27476, -11.15300, -11.13967, -11.25544, -11.1777…
$ lon <dbl> 34.03006, 34.14594, 34.14297, 34.14619, 34.20031, 34…
$ income <dbl> 409.4701, 520.8072, 581.3331, 324.0727, 532.1844, 53…
$ age <int> 10, 35, 7, 43, 45, 51, 24, 17, 38, 69, 10, 21, 32, 3…
$ sex <chr> "Man", "Woman", "Woman", "Woman", "Man", "Woman", "M…
$ health <dbl> 82.78928, 81.18602, 80.57399, 82.84023, 87.40737, 57…
$ net <int> 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1…
$ malaria_risk_pre <dbl> 35.74454, 36.65260, 22.87415, 35.41934, 27.49873, 42…
$ malaria_risk_post <dbl> 42.52199, 34.27589, 32.67552, 43.87256, 27.16945, 41…
2.2 Running a regression
We can now run a regression with income and net on the right-hand side and malaria risk on the left-hand side.
<-lm(malaria_risk_post~income + net, data=rct_data)
m1modelsummary(m1, stars = TRUE, gof_map = c("nobs", "r.squared"))
(1) | |
---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
(Intercept) | 68.138*** |
(1.816) | |
income | -0.045*** |
(0.004) | |
net | -10.125*** |
(0.533) | |
Num.Obs. | 1000 |
R2 | 0.342 |
The interpretation for the two coefficients is the following:
1. For every unit increase in income (e.g. for every dollar), the risk of malaria infection decreases by 0.045 units, holding everything else constant.
2. People who use mosquito nets, have on average a lower risk of infection with malaria of 10.125 units, holding everything else constant.
2.3 Creating a scatterplot for the two groups.
We will now create a scatterplot showing the relationship between income and malaria risk and identifying the individuals who use nets. vs. not.
#Step1: Adding a new character variable determining whether people use nets (treatment) or not (control).
$type<-"Control\n Don't Use Nets"
rct_data$type[rct_data$net==1]<-"Treatment\n Use Nets"
rct_data
#Step2: Defining colors for the two groups
<- c("Control\n Don't Use Nets" = "black", "Treatment\n Use Nets" = "red")
cols
#Step3: Running a regression with the whole data
<-lm(malaria_risk_post~income, data=rct_data)
x#Saving the intercept
<-x$coefficients[1]
a#Saving the slope
<-x$coefficients[2]
b#summary(x)
#Saving the intercept
<-x$coefficients[1]
a a
(Intercept)
62.64059
summary(x)
Call:
lm(formula = malaria_risk_post ~ income, data = rct_data)
Residuals:
Min 1Q Median 3Q Max
-29.4974 -6.9580 0.0078 7.0141 27.5299
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.640585 2.091456 29.95 <2e-16 ***
income -0.044662 0.004153 -10.75 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.819 on 998 degrees of freedom
Multiple R-squared: 0.1038, Adjusted R-squared: 0.1029
F-statistic: 115.6 on 1 and 998 DF, p-value: < 2.2e-16
modelsummary(x, stars = TRUE, gof_map = c("nobs", "r.squared"))
(1) | |
---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
(Intercept) | 62.641*** |
(2.091) | |
income | -0.045*** |
(0.004) | |
Num.Obs. | 1000 |
R2 | 0.104 |
#Step4: Creating the scatterplot
<-ggplot(rct_data, aes(x=income, y=malaria_risk_post, group = type)) +
figure2geom_point(aes(shape=type, color=type), size=2)+
scale_shape_manual(name="", values = c(16, 17))+
scale_color_manual(name="", values = cols)+
geom_segment(aes(x = min(income), y = a, xend = max(income), yend = b * max(income) + a), color = "black")+
theme_bw()+
scale_x_continuous(name = "income")+
scale_y_continuous(name = "malaria risk") +
theme(axis.text.x = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
<-reposition_legend(figure2, 'bottom left') figure2
We can also create separate lines for the two groups.
#Step4: Running a regression with the people who do not use mosquito nets
<-lm(malaria_risk_post~income, data = subset(rct_data, net==0))
x1#Saving the intercept
<-x$coefficients[1]
a1#Saving the slope
<-x$coefficients[2]
b1
#Step5: Running a regression with the people who use mosquito nets
<-lm(malaria_risk_post~income, data=subset(rct_data, net==1))
x2#Saving the intercept
<-x2$coefficients[1]
a2#Saving the slope
<-x2$coefficients[2] b2
<-ggplot(rct_data, aes(x=income, y=malaria_risk_post, group = type)) +
figure3geom_point(aes(shape=type, color=type), size=2)+
scale_shape_manual(name="", values = c(16, 17))+
scale_color_manual(name="", values = cols)+
#geom_segment(aes(x = min(income), y = a, xend = max(income), yend = b * max(income) + a), color = "black")+
geom_segment(aes(x = min(income), y = a1, xend = max(income), yend = b1 * max(income) + a1), color = "black")+
geom_segment(aes(x = min(income), y = a2, xend = max(income), yend = b2 * max(income) + a2), color = "red")+
theme_bw()+
scale_x_continuous(name = "income")+
scale_y_continuous(name = "malaria risk") +
theme(axis.text.x = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
<-reposition_legend(figure3, 'bottom left') figure3
2.4 Matching based on Mahalanobis Distance
Assignment to treatment is impacted by Income and Pre Malaria Risk:
- People who run a higher risk of getting malaria are more likely to use the net.
- People with higher incomes are more likely to get the net.
Our goal is to find people who are very similar in pre-malaria risk and income, but have slightly different outcomes:
- Some people choose treatment (use the net)
- Some people don’t (not use the net)
We want people who have the same pre-malaria risk and income to choose different outcomes. In the following chunk, we use Nearest Neighbor Matching using Mahalanobis distance.
<-matchit(net~income + malaria_risk_pre,
matched_obdata = rct_data,
method = "nearest",
distance = "mahalanobis",
replace = FALSE)
Warning: Fewer control units than treated units; not all treated units will get
a match.
matched_ob
A matchit object
- method: 1:1 nearest neighbor matching without replacement
- distance: Mahalanobis
- number of obs.: 1000 (original), 956 (matched)
- target estimand: ATT
- covariates: income, malaria_risk_pre
As the message indicates, some observations got dropped while others were not. We can indicate in the scatterplot, which observations got dropped and which were kept.
#Saving the matched data
<-match.data(matched_ob)
matched#Adding an indicator variable
$type_match<-"Kept"
matched#Only keeping the relevant variables
<-subset(matched, select = c(id, type_match))
matched_mer#Left join to the original dataframe, based on id
<-left_join(rct_data, matched_mer, by = c("id" = "id"))
rct2#Creating a new indicator variable pointing out which observations got dropped
$type_match[is.na(rct2$type_match)]<-"Dropped"
rct2$type_match_fin<-rct2$type
rct2$type_match_fin[rct2$type_match=="Dropped"]<-"Dropped"
rct2#Creating a new dataframe if observations were not dropped
#This is important to indicate in ggplot which observations got dropped
<-subset(rct2, type_match_fin!="Dropped")
fixed#Creating a new dataframe if observations were dropped
<-subset(rct2, type_match_fin=="Dropped") fixed_dropped
And now we can create the scatterplot. We mark the observations that got dropped in blue.
<-ggplot(rct2, aes(x=income, y=malaria_risk_pre, group = type_match_fin)) +
figure15geom_point(data = fixed_dropped, aes(shape=type), color="blue", size=4)+
geom_segment(aes(x = min(income), y = a1, xend = max(income), yend = b1 * max(income) + a1), color = "black")+
geom_segment(aes(x = min(income), y = a2, xend = max(income), yend = b2 * max(income) + a2), color = "red")+
geom_point(data = fixed, aes(shape=type_match_fin, color=type_match_fin), size=2)+
scale_shape_manual(name="", values = c(16, 17))+
scale_color_manual(name="", values = cols)+
#geom_line(data = matched_pairs2, aes(group = pair_no), linetype = "dashed", color="blue")+
theme_bw()+
scale_x_continuous(name = "income")+
scale_y_continuous(name = "malaria risk") +
theme(axis.text.x = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
<-reposition_legend(figure15, 'bottom left') figure15
2.5 Comparing Naive Regression and Matched Data
We can now examine how much the coefficient for the net variable changes as a result of the matching procedure.
<-lm(malaria_risk_post~net, data = rct_data)
x1
<-lm(malaria_risk_post~net, data = matched)
x2
<-list("Naive Regression"= x1,
models"Matched"= x2)
modelsummary(models, stars = TRUE, gof_map = c("nobs", "r.squared"))
Naive Regression | Matched | |
---|---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
(Intercept) | 45.662*** | 45.662*** |
(0.415) | (0.417) | |
net | -10.083*** | -10.069*** |
(0.574) | (0.589) | |
Num.Obs. | 1000 | 956 |
R2 | 0.236 | 0.234 |
We now see that the net variable is slightly smaller, as a result of the matching procedure.
2.6 Matching Inverse Probability Weighting
Nearest-neighbor matching can be costly in terms of dropping observations if it’s one on one matching. This can result in very small sample sizes, as a result of throwing away the data. We can also do matching based on propensity score matching weights.
#Step1: Running the logit
<-glm(net~income + malaria_risk_pre, data = rct_data,
model_prefamily = binomial(link = "logit"))
<-list(model_pre)
modelsmodelsummary(models, stars = TRUE, gof_map = c("nobs", "r.squared"))
(1) | |
---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
(Intercept) | 0.213 |
(0.606) | |
income | 0.000 |
(0.001) | |
malaria_risk_pre | -0.001 |
(0.007) | |
Num.Obs. | 1000 |
#Step2: Saving the propensity and renaming the variable
<-augment_columns(model_pre,
rct_ipw
rct_data,type.predict = "response")
names(rct_ipw)[names(rct_ipw)==".fitted"]<-"propensity"
<-subset(rct_ipw, select = -c(.se.fit, .resid, .hat, .sigma, .cooksd, .std.resid)) rct_ipw
We can now calculate the inverse probability weights using the following formula:
\[ \frac{\text{Treatment}}{\text{Propensity}} + \frac{1-\text{Treatment}}{1-\text{Propensity}} \]
$ipw<-rct_ipw$net/rct_ipw$propensity + ((1-rct_ipw$net)/( 1 - rct_ipw$propensity)) rct_ipw
Let us now see the new column.
<-subset(rct_data, select = -c(lat, lon, type, health, malaria_risk_pre))
rct_data2<-subset(rct_ipw, select = -c(lat, lon, type, health, malaria_risk_pre)) rct_ipw2
head(rct_data2, n=5)
head(rct_ipw2, n=5)
We can also put these weights in the scatterplot.
<- c("Control\n Don't Use Nets" = "black", "Treatment\n Use Nets" = "red")
cols
<-subset(rct_ipw2, select = c(id, ipw))
rct_ipw3<-left_join(rct2, rct_ipw3, by = c("id" = "id"))
rct3
<-ggplot(rct3, aes(x=income, y=malaria_risk_pre, group = type_match_fin)) +
figure_ipw#geom_point(data = rct3, aes(shape=type, color=type), alpha = .2, size=2)+
#geom_point(aes(shape=type_match_fin, color=type_match_fin), size=2, alpha = 0.4)+
geom_point(data = rct3, aes(shape=type, color=type, size=ipw))+
scale_shape_manual(name="", values = c(16, 17))+
scale_color_manual(name="", values = cols)+
theme_bw()+
scale_x_continuous(name = "income")+
scale_y_continuous(name = "malaria risk") +
theme(axis.text.x = element_text(size=14),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5),
legend.position.inside = c(1, 0),
#Legend.position values should be between 0 and 1. c(0,0) corresponds to the "bottom left"
#and c(1,1) corresponds to the "top right" position.
legend.box.background = element_rect(fill='white'),
legend.background = element_blank(),
legend.text=element_text(size=12))
<-reposition_legend(figure_ipw, 'bottom left') figure_ipw
Let us now add this third model and compare it to the other two. We can see that the model with IPW is slightly higher than the one with matched data and smaller than the one with ipw weights.
<-lm(malaria_risk_post~net, data = rct_data)
x1
<-lm(malaria_risk_post~net, data = matched)
x2
<-lm(malaria_risk_post~net, data = rct3, weights=ipw)
x3
<-list("Naive Regression"= x1,
models"Matched"= x2,
"IPW" = x3)
modelsummary(models, stars = TRUE, gof_map = c("nobs", "r.squared"))
Naive Regression | Matched | IPW | |
---|---|---|---|
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |||
(Intercept) | 45.662*** | 45.662*** | 45.659*** |
(0.415) | (0.417) | (0.404) | |
net | -10.083*** | -10.069*** | -10.075*** |
(0.574) | (0.589) | (0.572) | |
Num.Obs. | 1000 | 956 | 1000 |
R2 | 0.236 | 0.234 | 0.237 |
In practice, the choice between Mahalanobis distance and Inverse Probability Weights will depend on how many observations we want to keep. In this is particular case, we only lost 44 observations (1000-956), so the Mahalanobis distance matching technique might be acceptable. In other applications, we may use IPW, as a solution to the problem of dropping too many observations.