## Thursday, November 13, 2014

### Propensity Score Matching in R (My first post in blogger)

My first post in blogger is about Propensity Score Matching (PSM) in R. In this post I will introduce the R packages MatchIt and Zelig. The first one is devoted to perform different PSM algorithms and the other one is used in order to estimate the average treatment effect (ATE) and the effect of the treatment on the treated (ATT). First of all, I will simulate a population where some individuals receive the treatment, and the remaining ones do not.
rm(list=ls())
library(MatchIt)
library(Zelig)
N <- 1000
x1 <- round(rnorm(N,20,4))
x2 <- round(rnorm(N,80,10))
f <- 1 + 1*x1 - 0.28*x2
p <- exp(f)/(1+exp(f))plot(p)
z <- rbinom(N,1, p)table(z)
y <- rep(0, times=N)
# Different values for y over z# The mean difference for y is 5000
y[z == 1] <- 9000 + 2*x1[z == 1] + 0.5*x2[z == 1]
y[z == 0] <- 4000 + 2*x1[z == 0] + 0.5*x2[z == 0]
y <- jitter(y, amount=1000,factor=2)
Data <- data.frame(y=y, x1=x1, x2=x2, z)
# The treatment population
Treats <- subset(Data, z == 1)
colMeans(Treats)
# The control population
Control <- subset(Data, z == 0)colMeans(Control)
# Computing the real effect of the treatment
model <- lm(y ~ z + x1 + x2,data=Data)
Effect <- model$coeff[2] Effect   Then I will run a logistic regression in order to estimate the propensity scores. That is the probability of receiving the treatment. # The propensity scores  ps.model <- glm(z ~ x1 + x2,family = binomial(link = "logit"),data = Data) plot(ps.model$fitted)summary(ps.model)
Data$ps <- ps.model$fitted table(Data$z)  hist(Data$ps[z==1])
hist(Data$ps[z==0])   After that, one may use different matching algorithms. Exact matching is a technique used to match individuals based on the exact values of covariates. # Exact matching  m1.ps <- matchit(ps.model, method="exact",data=Data) summary(m1.ps, covariates = T)  m1.data <- match.data(m1.ps)    Nearest neighbour matching (NNM) is an algorithm that matches individuals with controls (it could be more two or more controls per treated unit) one by one based on a distance. #Nearest neighbor matching m3.ps <- matchit(ps.model, method="nearest", radio=1,data=Data) summary(m3.ps, covariates = T) plot(m3.ps) plot(m3.ps, type="jitter") plot(m3.ps, type="hist")  m3.data <- match.data(m3.ps)  m3.ps$match.matrix
Pairs <- cbind(Data[row.names(m3.ps$match.matrix),], Data[m3.ps$match.matrix,])
Pairs

Optimal matching solves the problem inherent to the NNM, that matches units one at a time, without taking into account any global (over the entire population) distance measure. This way, with NNM the last treated unit could could match an inappropriate unit.
#Optimal matching
m5.ps <- matchit(ps.model, method="optimal",data=Data)
summary(m5.ps, covariates = T)
plot(m5.ps)
plot(m5.ps, type="jitter")
plot(m5.ps, type="hist")
m5.data <- match.data(m5.ps)
# m5.data[(m5.data$subclass == 1),]   Full matching forms subgroups in an optimal way. #Full matching  m4.ps <- matchit(ps.model, method="full",data=Data) summary(m4.ps, covariates = T) plot(m4.ps) plot(m4.ps, type="jitter") plot(m4.ps, type="hist")  m4.data <- match.data(m4.ps)   Caliper matching is a refinement of the NNM. It defines a constraint over the individual distant measure. So, it minimises the probability of bad matching. # Caliper matching  m8.ps <- matchit(ps.model, method="nearest", caliper=0.25*sd.ps,data=Data) summary(m8.ps, covariates = T) plot(m8.ps) plot(m8.ps, type="jitter") plot(m8.ps, type="hist")  m8.data <- match.data(m8.ps)  m8.data.control <- match.data(m8.ps,"control")  m8.data.treat <- match.data(m8.ps,"treat")  m8.ps$match.matrix
Pairs <- cbind(Data[row.names(m8.ps$match.matrix),], Data[m8.ps$match.matrix,])
Pairs

Finally, with the matched data, we estimate the average treatment effects, via automatic simulation, with the Zelig package.
# Treatment Effects
z1.ps <- zelig(y ~ z + x1 + x2,data=m8.data, model="ls")
summary(z1.ps)
x <- setx(z1.ps, z = 0) x1 <- setx(z1.ps, z = 1)
s.out <- sim(z1.ps, x = x, x1 = x1)
summary(s.out)
plot(s.out)


Zelig package also allows to estimate the effect on the treated, and the effect on the controls.
# Average Treatment Effect on the Controls
z1c.ps <- zelig(y ~ x1 + x2,data = m8.data.control, model="ls")
summary(z1c.ps)
x.out1 <- setx(z1c.ps,data = m8.data.treat,cond = TRUE)
s.out1 <- sim(z1c.ps, x = x.out1)summary(s.out1)
plot(s.out1)


# Average Treatment Effect on the Treated
z1t.ps <- zelig(y ~ x1 + x2,data = m8.data.treat, model="ls")
summary(z1t.ps)
x.out2 <- setx(z1t.ps,data = m8.data.control,cond = TRUE)
s.out2 <- sim(z1t.ps, x = x.out2)
summary(s.out2)
plot(s.out2)

I hope that you found this post valuable. Feel free to reproduce the code wherever you want.