# May 27, 2015
# Research Design for Causal Inference
# MTS 525
# Northwestern University
# Created by:
# Aaron Shaw (aaronshaw@northwestern.edu)
# Start from an empty global environment. NOTE: this only removes objects;
# it does not detach packages or reset options, so restarting R is the more
# reliable way to get a clean session.
rm(list=ls())
# Set an appropriate working directory for your session
# setwd("Data/")
# Below is an example of Instrumental Variables Estimation (IVE) using
# two-stage least squares (tsls) regression from Murnane & Willett (2011)
# We'll conduct 2SLS with both "sem" and "AER". Each package is installed
# only when it is not already available, so re-running this script does not
# trigger a fresh download and reinstall every time.
if (!requireNamespace("sem", quietly = TRUE)) {
  install.packages("sem")
}
if (!requireNamespace("AER", quietly = TRUE)) {
  install.packages("AER")
}
library(AER)
library(sem)
library(foreign) # this makes it easy to read Stata's ".dta" files.
library(Hmisc) # Sometimes I use this for the "describe()" function.
library(ggplot2)
# Read the Stata file that UCLA hosts alongside its worked examples from
# Murnane & Willett's book. read.dta() fetches it straight from the URL.
# NOTE(review): this is a remote download; confirm the URL is still live.
d <- read.dta(
  file = "http://www.ats.ucla.edu/stat/stata/examples/methods_matter/chapter10/dee.dta"
)
# Descriptive overview of every variable except the "schoolid" column.
# First the describe() report for each column...
sapply(d[, names(d) != "schoolid"], describe)
# ...then the standard summary() of each column.
sapply(d[, names(d) != "schoolid"], summary)
# The variance of each column is worth a look as well:
sapply(d[, names(d) != "schoolid"], var)
# Cross-tabulate the proposed instrument (distance) against the key
# predictor (college), then summarise that table.
table(d[["distance"]], d[["college"]])
summary(table(d[["distance"]], d[["college"]]))
# Documentation for the correlation and covariance functions used below:
help("cor")
help("cor.test")
help("cov")
# How are the key predictor (college) and the DV (register) related?
cor.test(x = d$register, y = d$college)
cov(x = d$register, y = d$college)
# Naive OLS model that ignores any endogeneity of college:
summary(lm(formula = register ~ college, data = d))
# Further correlation/covariance checks that Murnane & Willett present to
# help justify and think through the logic of the instrument:
cor.test(x = d$register, y = d$distance)
cor.test(x = d$college, y = d$distance)
cov(x = d$register, y = d$distance)
cov(x = d$college, y = d$distance)
# Two-stage least squares, done by hand.
# First stage: regress the endogenous predictor (college) on the
# instrument (distance) and inspect the fit.
m1 <- lm(formula = college ~ distance, data = d)
summary(m1)
# Second stage (naive version): regress the outcome on the first-stage
# fitted values. CAUTION: the standard errors reported here are wrong
# (see Murnane & Willett, Angrist & Pischke, or Wooldridge for details).
instrument.college <- m1[["fitted.values"]]
m2 <- lm(formula = register ~ instrument.college, data = d)
summary(m2)
# You can also use either the tsls() or ivreg() functions to get the
# tsls estimates & standard errors:
help(tsls)
help(ivreg)
# tsls(): structural equation first, then a one-sided formula naming the
# instrument(s).
m.tsls <- tsls(register ~ college, ~ distance, data = d)
summary(m.tsls)
# I prefer the ivreg() output. By default its summary only reports a Wald
# test of the regressors; passing diagnostics = TRUE adds the first-stage
# F test for weak instruments (plus a Wu-Hausman endogeneity test), which
# is what tells you whether the instrument is "strong" or "weak".
# ivreg(): regressors go to the left of "|", instruments to the right.
m.ivreg <- ivreg(register ~ college | distance, data = d)
summary(m.ivreg, diagnostics = TRUE)
# Now for the full model, adding the race/ethnicity covariates.
# Stage 1 (by hand): regress college on the covariates plus the instrument.
# (The original line had a stray closing parenthesis, which is a syntax
# error that prevents the script from being parsed.)
m.full.1 <- lm(college ~ hispanic + black + otherrace + distance, data = d)
summary(m.full.1)
# Stage 2 (by hand): replace college with its first-stage fitted values.
# This overwrites the instrument.college vector assigned earlier in the
# script. As with any manual second stage, these standard errors are not
# the correct 2SLS standard errors.
instrument.college <- m.full.1$fitted.values
m.full.2 <- lm(register ~ instrument.college + hispanic + black + otherrace,
               data = d)
summary(m.full.2)
# Full model estimated in one step with ivreg(), which fits both stages.
# The covariates appear on both sides of "|" because they serve as their
# own instruments; distance is the excluded instrument for college.
m2.ivreg <- ivreg(register ~ college + hispanic + black + otherrace |
                    distance + hispanic + black + otherrace, data = d)
# diagnostics = TRUE adds the weak-instruments F test and the Wu-Hausman
# test to the default summary output.
summary(m2.ivreg, diagnostics = TRUE)
# We could also specify a logistic or probit model for the dependent
# variable.
#
# See Murnane & Willett for further discussion of these results as
# well as the rationale guiding their model specification.