R Functions
Here is a growing collection of some (I think) handy R functions that I have written. I plan to complete some better documentation when the time becomes available.
Functions (use sidebar to navigate)
calcBy - Similar to bysort in stata...I haven't found a simple function that does what this function does (see file for instructions).
plm - Runs a semiparametric Partial Linear differencing Model (outlined in Yatchew 1997 and 1998) that allows for nonparametric estimation of the relationship between one independent variable and outcome, while controlling (linearly) for a vector of other influences.
balance - This is a handy function for analyses that use the Match function. One of the best features of this function is that it returns a simple data frame of balancing results that can easily be copy and pasted into Excel for the comparison of many matching specifications (those that use Match know that, while very thorough, Sekhon's balance function is not practical for comparing across specifications). Among other features, the function will return a data frame of matched treated and control units that result from Match. See file for more information.
countif - Function evaluates a logical expression and returns the number of elements that meet and do not meet the condition. Convenient for complex expressions. Function also will return a data frame with the elements of the original data that meet the specified condition.
calcBy
#function that emulates bysort from stata: Updated 02/26/2011
################################################################################################################
# ARGUMENTS #
# #
# x = name of variable (as string) for which calculations will be made #
# data = data frame in which the variables of interest lie #
# by = name of variable (as string) delineating the groups over which the calculations will be made #
# FUN = name of function (as string) for the calculation #
# sort = should the data be returned sorted according to the by variable? #
# collapse = should the duplicate observations (according to the by variable) by dropped? #
# #
# The function is designed (similar to bysort in Stata) to make calculations according to groups within the #
# data. #
# If you store the function in an object with the same name as the original data a new column will be added #
# to the data with the function name appended to the original variable name. #
# #
# if you store the function in a new object name, a new data From will be created with the new column. #
# #
################################################################################################################
################################################################################################################
calcBy <- function(x="varname.string", data=NULL, calc.data=NULL, by="by.varname.string", FUN="sum",
sort=FALSE, collapse=FALSE){
#first call the aggregate function
if(is.null(calc.data)){
agg.data <- aggregate(x=data[x], by=list(list = data[,by]), FUN=FUN)
#note that the comma in "data[,by]" b/c object is list
}
if(!is.null(calc.data)){#then use the calc.data to calculate FUN for the groups
agg.data <- aggregate(x=calc.data[x], by=list(list = calc.data[,by]), FUN=FUN)
#note that the comma in "data[,by]" b/c object is list
}
#change names for merge
names(agg.data) <- c(by, paste(x, FUN, sep="."))
#merge data back into main dataset
data <- merge(data, agg.data, by=by, all.x=T)
#remove the duplicate observations
if(collapse){
data$dup <- duplicated(data[by])
data <- subset(data, dup ==FALSE)
data$dup <- NULL
}#end if
#sort data
if(sort){
data <- data[order(data[by]),]
}#end if
return(data)
}#end function
plm
##Program to estimate partial linear model using loess at last stage to estimate nonparametric portion
plm <- function(data, x="String nonpar var", y="string for outcome", z="string for controls", order=10,
loess=T, weights, span = 0.75, seq=0, se= T, predict= T, degree = 2,
family = c("gaussian", "symmetric")){
#order the data according to the independent variable of interest
sdata <- data[order(data[x]),]
#store X, Y and Z variables in matrix for use in OLS
Xs <- as.matrix(sdata[x])
Ys <- as.matrix(sdata[y])
Zs <- as.matrix(sdata[z])
#Define the potential weighting vectors (See Yatchew 1997)
#Note that these are normalized weights so that, for instance, W1 is the same as taking simple 1st dif.
W1 <- c(0.7071, -0.7071)
W2 <- c(0.8090, -0.5000, -0.3090)
W3 <- c(0.1942, 0.2809, 0.3832, -0.8582)
W4 <- c(0.2708, -0.0142, 0.6909, -0.4858, -0.4617)
W5 <- c(0.9064, -0.2600, -0.2167, -0.1774, -0.1420, -0.1103)
W6 <- c(0.2400, 0.0300, -0.0342, 0.7738, -0.3587, -0.3038, -0.3472)
W7 <- c(0.9302, -0.1965, -0.1728, -0.1506, -0.1299, -0.1107, -0.0930, -0.0768)
W8 <- c(0.2171, 0.0467, -0.0046, -0.0348, 0.8207, -0.2860, -0.2453, -0.2260, -0.2879)
W9 <- c(0.9443, -0.1578, -0.1429, -0.1287, -0.1152, -0.1025, -0.0905, -0.0792, -0.0687, -0.0588)
W10 <- c(0.9494, -0.1437, -0.1314, -0.1197, -0.1085, -0.0978, -0.0877, -0.0782, -0.0691, -0.0606, -0.0527)
#Round order in case of non-integer
order <- round(order, 0)
if(order < 1 | order > 10) {
return("The order of the weighting matrix must lie between 1 and 10 (inclusive)")
}#end if
#Identify the weighting vector according to the value of order
W <- get(paste("W", order, sep=""))
##Take difference for Y and Z according to the order of X
#counter
j = 1
#define variables for loop
Y.delta <- vector()
Z.delta <- matrix(0, (nrow(Zs) - order), ncol(Zs))
#This builds the matrix of first difference observations
for(i in (order + 1):nrow(Xs)){
Y.delta[j] <- Ys[i] * W[1]
for(c in 1:ncol(Zs)){
Z.delta[j,c] <- Zs[i,c] * W[1]
} #end for
j=j+1
} #end for
#Now calculate the remainder of the differencing
for(o in 1:order){
j = 1
#Loop over the rows
for(i in (order + 1):nrow(Xs)){
Y.delta[j] <- Y.delta[j] + Ys[i - o] * W[o + 1]
#Loop over the columns
for(c in 1:ncol(Zs)){
Z.delta[j,c] <- Z.delta[j,c] + Zs[i - o,c] * W[o + 1]
} #end for columns
j=j+1
} #end for rows
} #end for order
#regression without intercept
reg <- lm(Y.delta ~ 0 + Z.delta)
print(summary(reg))
#Get the coefficients from the differnce regression
b.dif <- reg$coefficients
#This estimates y.plm = Ys - Z*Bdiff which can then be plotted against X to determin f(X) (
#NOTE THAT IT IS DEMEANED A LA THE STATA VERSION)
y.plm <- Ys - (Zs %*% b.dif - mean(Zs %*% b.dif))
#######################################################################
#Graphing section
if(loess){
print("here")
if(seq==0){
seq <- seq(min(Xs), max(Xs), length.out=30)
}#end if seq
#Use loess to estimate the nonparametric function
low <- loess(y.plm~Xs, span=span, degree=degree, family=family)
lowp <- predict(low, seq, se=F)
plot(seq, lowp, type="l", ylab=y, xlab=x)
if(se){
#Predict sequence of loess smoothed points along X range with standard errors
lowp <- predict(low, seq, se=T)
lines(seq, (lowp$fit + 1.96*lowp$se), lty=2)
lines(seq, (lowp$fit - 1.96*lowp$se), lty=2)
}#end if se
}#end if loess
#######################################################################
#Return output values
plm.out <- list(y.plm=y.plm, x.plm=Xs, loess.fit=low$fit, pred.fit=lowp$fit, se=lowp$se)
return(plm.out)
}#end function
balance
#__________________________________________________________________________#
# Merlin M. Hanauer 8/23/2009 #
# balance function returns pre- and post-match balance information, in #
# tabular form, and the full post-match data set of matched pairs, #
# (optional) after Match has been run #
# input: #
# dataAA- the original dataset or covariate matrix, used to establish #
# pre-match balance. NOTE that if you want simple balance #
# measures based on the covariate matrix you used in Match, #
# then dataAA should use the same matrix as used in Match, #
# without using names. if the matched set is requested in this #
# specifacation, then the covariats and treatment and Y's will #
# be returned. By setting dataAA equal to your original dataset#
# , specifying column names and setting matchout=TRUE, the #
# matched dataset will include all the variabls from the #
# original dataset. #
# matchZ- the object returned by Match #
# treatin- the treatment indicator for dataset in which Match was #
# performed, should be a string #
# names- a vector of names (optional). this should be used if you #
# would like to show balance information for covariates #
# that were not included as covariates in Match #
# matchout- if set to TRUE, balance will return the post match set of #
# matched pairs from dataAA #
# fromMatch-indicates if matchZ is coming from Match, if just comparing #
# 2 established data sets, set to FALSE #
# dataM- this should only be used if you have an expanded data set #
# that includes all observations from an original #
# matching specification (in addition to the original data #
# set) **used for very specific case, should be=NULL #
# impute- default is FALSE, tells the function to impute outcomes for #
# treated units according to the bias-adjustment matching #
# estimator #
# the names vector should be designated prior to calling balance, i.e., #
# names <- cbind("name_1", "name_2,..., "name_N") #
# where name_n is equal to the column name of interest from the original #
# data set #
#__________________________________________________________________________#
balance <- function(dataAA, matchZ, Y=NULL, treatin, names=NULL, matchout=FALSE, fromMatch=TRUE, dataM=dataAA, impute=FALSE) {
#create observation numbers for each data set so that a matched set can be created
dataAA$obs <- seq(nrow(dataAA))
dataM$obs <- seq(nrow(dataM))
#condition if names are not used, only balance on match covariates will be returned
if (is.null(names)) {
#assign unmatched data
dataA <- as.data.frame(dataAA)
#assign treatment column
treat <- dataAA[treatin]
#create matched data set
matchset <- as.data.frame(matchZ$mdata$X)
tr <- as.data.frame(matchZ$mdata$Tr)
nvars <- ncol(matchset)
#create subsets of matched and unmatched data so as to calc sd
matcht <- subset(matchset, tr == 1)
matchc <- subset(matchset, tr == 0)
tdataA <- subset(dataA, treat == 1)
cdataA <- subset(dataA, treat == 0)
#this portion is just to create a matched set
#get observation number from matched data set for creating control weights later
ti <- as.data.frame(matchZ$index.treated)
names(ti)[1] <- "obs"
tc <- as.data.frame(matchZ$index.control)
names(tc)[1] <- "obs"
obsa <- rbind(ti, tc)
obsa <- as.data.frame(obsa)
matchall <- merge(obsa, dataA, by="obs", all.x="TRUE", sort=FALSE)
#if names are used then balance measures will be returned for all variables in "names"
}
else if(fromMatch==TRUE) { #note that the end if bracket and the else must be on the same line
#get observation number from matched data set for creating control weights later
ti <- as.data.frame(matchZ$index.treated)
#add a sequence that will identify which control observations were
#matched to which treated observations after the merge
ti$match.id <- seq(nrow(ti))
names(ti)[1] <- "obs"
tc <- as.data.frame(matchZ$index.control)
#add a sequence that will identify which control observations were
#matched to which treated observations after the merge
tc$match.id <- seq(nrow(tc))
names(tc)[1] <- "obs"
obsa <- as.data.frame(rbind(ti, tc))
#assign treatment data frame and order NY observation number so that it will align with the covariate data
tr <- as.data.frame(matchZ$mdata$Tr)
#this will drop all observations from the original data set that are not a part of the matched data set
matchall <- merge(obsa, dataM, by="obs", all.x=TRUE, sort=FALSE)
#sort according to match.id so that the stacked data correspond to actual matches (i.e., treted observation
#1 is matched with control observation 1 etc)
matchall <- matchall[order(-matchall[treatin], matchall$match.id),]
#create matchset
matchset <- matchall[names]
#matchsets by treatment
matchsetat <- subset(matchall, tr == 1)
matchsetac <- subset(matchall, tr == 0)
matcht <- matchsetat[names]
matchc <- matchsetac[names]
#create unmatched set
dataA <- dataAA[names]
#assign treatment column
treat <- dataAA[treatin]
#create unmatched set by treatment
tdataA <- subset(dataA, treat == 1)
cdataA <- subset(dataA, treat == 0)
nvars <- ncol(dataA)
}
else if (fromMatch==FALSE) {
#instead of creating a data set from the output of Match, this will
#use the user specified data set as a Comparison
#(matchZ) to the original data set (dataAA)
matchall <- matchZ
#create matchset
matchset <- matchall[names]
#define treatment for "matchset"
tr <- matchall[treatin]
#matchsets by treatment
matchsetat <- subset(matchall, tr == 1)
matchsetac <- subset(matchall, tr == 0)
matcht <- matchsetat[names]
matchc <- matchsetac[names]
#create unmatched set
dataA <- dataAA[names]
#assign treatment column
treat <- dataAA[treatin]
#create unmatched set by treatment
tdataA <- subset(dataA, treat == 1)
cdataA <- subset(dataA, treat == 0)
nvars <- ncol(dataA)
} #end if
#now add imputed values, if specified, to the dataframe
if(impute == TRUE) {
#control outcomes
yc <- matchsetac[Y]
matchset
#estimate coefficients for regression of controls units
xc <- as.matrix(cbind(as.matrix(matchc), matrix(1, nrow(matchc), 1)))
b.c <-inv(t(xc) %*% xc) %*% t(xc) %*% as.matrix(yc)
yhat <- as.data.frame(as.matrix(cbind(matchall[names], matrix(1, nrow(matchall), 1))) %*% b.c)
names(yhat)[1] <- "yhat"
matchall <- data.frame( matchall, yhat)
}
#dim all the variable to be used in loop
balu <- list()
balm <- list()
muT <- 1
muC <- 1
mdifu <- 1
eqqu <- 1
mmT <- 1
mmC <- 1
mdifm <- 1
eqqm <- 1
pctimp <- 1
meanT <- vector()
meanC <- vector()
difmean <- vector()
eqqdif <- vector()
pctimpO <- vector()
normdifu <- vector()
normdifm <- vector()
normdif <- vector()
pctimprove <- vector()
names2 <- vector()
mu <- vector()
#loop over each variable to calculate and compile stats
for (i in 1:nvars) {
#calls Sekhon's univariate balance function for unmatched variables
balu <- balanceUV(dataA[, i][treat ==1], dataA[, i][treat != 1])
#means unmatched// note store with [[]] call with []
muT[[i]] <- balu$mean.Tr
muC[[i]] <- balu$mean.Co
#mean dif unmatched
mdifu[[i]] <- muT[[i]] - muC[[i]]
#raw mean Esq from balanceUV
eqqu[[i]] <- balu$qqsummary.raw$meandiff
#normalized difference for unmatched, as suggested by Imbens
normdifu[i] <- abs(mdifu[i] / sqrt(sd(tdataA[, i])^2 + sd(cdataA[, i]^2)))
#calls Sekhon's univariate balance function for animated variables
balm <- balanceUV(matchset[, i][tr ==1], matchset[, i][tr != 1])
#means matched set
mmT[[i]] <- balm$mean.Tr
mmC[[i]] <- balm$mean.Co
#mean dif matched
mdifm[[i]] <- mmT[[i]] - mmC[[i]]
#Esq for matched
eqqm[[i]] <- balm$qqsummary.raw$meandiff
#normdif for matched
normdifm[i] <- abs(mdifm[i] / sqrt(sd(matcht[, i])^2 + sd(matchc[, i]^2)))
#pct meandif imp
pctimp[[i]] <- (abs(mdifu[[i]]) - abs(mdifm[[i]])) / abs(mdifu[[i]])
#this combines "stacks" all stats for each loop
meanT <- rbind(meanT, muT[i], mmT[i])
meanC <- rbind(meanC, muC[i], mmC[i])
difmean <- rbind(difmean, mdifu[i], mdifm[i])
normdif <- rbind(normdif, normdifu[i], normdifm[i])
eqqdif <- rbind(eqqdif, eqqu[i], eqqm[i])
pctimprove <- rbind(pctimprove, 0, pctimp[i])
if (is.null(names)) {
names2 <- rbind(names2, names(dataA)[i], names(dataA)[i])
} else {
names2 <- rbind(names2, names[i], names[i])
}
mu <- rbind(mu, "Unmatched", "Matched")
}
#set output
#condition, if the user specifies the return of a matched set
if (matchout) {
match.set <- as.data.frame(matchall)
bal <- data.frame(names2, mu, meanT, meanC, difmean, normdif, eqqdif, pctimprove)
out <- list(bal=bal, match.set=match.set)
} else {
out <- data.frame(names2, mu, meanT, meanC, difmean, normdif, eqqdif, pctimprove)
}
return(out)
}
countif
#Function: countif
#####################################################################################
# #
# Function evaluates a logical expression and returns the number of elements that #
# meet and do not meet the condition. Convenient for complex expressions. Function #
# also will return a data frame with the elements of the original data that meet #
# the specified condition. #
# #
# Arguments: #
# exp = Logical expression to be evaluated. Can be specified as string (most #
# convenient) or in the form data$column. #
# #
# data = Data on which exp will be evaluated. If data=NULL then the most #
# recent attached (see attach()) data will be used to evaluate exp. #
# #
# store= Logical, indicate whether or not elements of data that meet condition #
# exp should be stored in a new object. #
# #
#####################################################################################
countif <- function(exp=expression, data=NULL, store=FALSE){
#if no data are specified then use the attached data
if(is.null(data)){
cat("Data was not specified, so using data that was attached last", "\n")
s <- search()
data <- get(s[2])
}#end if(is.null(data))
#evaluate depending on the type of input for "exp"
if(typeof(exp) == "character"){
temp <- subset(data, eval(parse(text=exp)))
} else {
storage.mode(exp) <- "logical"
temp <- subset(data, exp)
}#end if/else(typeof)
#print results
cat("Count True = ", nrow(temp), "\n")
cat("Count False = ", nrow(data) - nrow(temp), "\n")
#if called, store data that meets the condition "exp" for return
if(store){
cond.data <- as.data.frame(temp)
return(cond.data)
}#end if(store)
}