R Functions

Here is a growing collection of some (I think) handy R functions that I have written. I plan to complete some better documentation when the time becomes available.

Homework

Functions (use sidebar to navigate)

calcBy - Similar to bysort in stata...I haven't found a simple function that does what this function does (see file for instructions).

plm - Runs a semiparametric Partial Linear differencing Model (outlined in Yatchew 1997 and 1998) that allows for nonparametric estimation of the relationship between one independent variable and outcome, while controlling (linearly) for a vector of other influences.

balance - This is a handy function for analyses that use the Match function. One of the best features of this function is that it returns a simple data frame of balancing results that can easily be copy and pasted into Excel for the comparison of many matching specifications (those that use Match know that, while very thorough, Sekhon's balance function is not practical for comparing across specifications). Among other features, the function will return a data frame of matched treated and control units that result from Match. See file for more information.

countif - Function evaluates a logical expression and returns the number of elements that meet and do not meet the condition. Convenient for complex expressions. Function also will return a data frame with the elements of the original data that meet the specified condition.

calcBy

#function that emulates bysort from stata: Updated 02/26/2011

################################################################################################################
#	                                             ARGUMENTS                                                       #
#	                                                                                                             #
#	       x = name of variable (as string) for which calculations will be made                                  #
#	    data = data frame in which the variables of interest lie                                                 #
#	      by = name of variable (as string) delineating the groups over which the calculations will be made      #
#	     FUN = name of function (as string) for the calculation                                                  #
#	    sort = should the data be returned sorted according to the by variable?                                  #
#	 collapse = should the duplicate observations (according to the by variable) by dropped?                     #
#	                                                                                                             #
#  The function is designed (similar to bysort in Stata) to make calculations according to groups within the   #
#	 data.                                                                                                       #
#  If you store the function in an object with the same name as the original data a new column will be added   #
#	 to the data with the function name appended to the original variable name.                                  #
#	                                                                                                             #
#	 if you store the function in a new object name, a new data From will be created with the new column.        #
#                                                                                                              #
################################################################################################################




################################################################################################################                                                                                               
                                                                                                               
calcBy <- function(x="varname.string", data=NULL, calc.data=NULL, by="by.varname.string", FUN="sum",
		  sort=FALSE, collapse=FALSE){
	
	#first call the aggregate function
	
	if(is.null(calc.data)){
				agg.data <- aggregate(x=data[x], by=list(list = data[,by]), FUN=FUN)  
                #note that the comma in "data[,by]" b/c object is list
	}
			
	if(!is.null(calc.data)){#then use the calc.data to calculate FUN for the groups
				agg.data <- aggregate(x=calc.data[x], by=list(list = calc.data[,by]), FUN=FUN)  
                #note that the comma in "data[,by]" b/c object is list
	}

	#change names for merge
	names(agg.data) <- c(by, paste(x, FUN, sep="."))
	
	#merge data back into main dataset
	data <- merge(data, agg.data, by=by, all.x=T)
	
	#remove the duplicate observations
	if(collapse){
    		data$dup <- duplicated(data[by])
    		data <- subset(data, dup ==FALSE)
    		data$dup <- NULL
	}#end if
	
	#sort data
	if(sort){
		data <- data[order(data[by]),]
	}#end if
	
	return(data)
}#end function
	
	

plm

##Program to estimate partial linear model using loess at last stage to estimate nonparametric portion

plm <- function(data, x="String nonpar var", y="string for outcome", z="string for controls", order=10,
			loess=T, weights, span = 0.75, seq=0, se= T, predict= T, degree = 2, 
                        family = c("gaussian", "symmetric")){
	#order the data according to the independent variable of interest
	sdata <- data[order(data[x]),]
	
	#store X, Y and Z variables in matrix for use in OLS
	Xs <- as.matrix(sdata[x])
	
	Ys <- as.matrix(sdata[y])
	
	Zs <- as.matrix(sdata[z])
	
	#Define the potential weighting vectors (See Yatchew 1997)
	#Note that these are normalized weights so that, for instance, W1 is the same as taking simple 1st dif.	
	W1  <- c(0.7071, -0.7071)
	W2  <- c(0.8090, -0.5000, -0.3090)
	W3  <- c(0.1942,  0.2809,  0.3832, -0.8582)
	W4  <- c(0.2708, -0.0142,  0.6909, -0.4858, -0.4617)
	W5  <- c(0.9064, -0.2600, -0.2167, -0.1774, -0.1420, -0.1103)
	W6  <- c(0.2400,  0.0300, -0.0342,  0.7738, -0.3587, -0.3038, -0.3472)
	W7  <- c(0.9302, -0.1965, -0.1728, -0.1506, -0.1299, -0.1107, -0.0930, -0.0768)
	W8  <- c(0.2171,  0.0467, -0.0046, -0.0348,  0.8207, -0.2860, -0.2453, -0.2260, -0.2879)
	W9  <- c(0.9443, -0.1578, -0.1429, -0.1287, -0.1152, -0.1025, -0.0905, -0.0792, -0.0687, -0.0588)
	W10 <- c(0.9494, -0.1437, -0.1314, -0.1197, -0.1085, -0.0978, -0.0877, -0.0782, -0.0691, -0.0606, -0.0527)

	#Round order in case of non-integer
	order <- round(order, 0)
	
	if(order < 1 | order > 10) {
		return("The order of the weighting matrix must lie between 1 and 10 (inclusive)")
	}#end if
	
	#Identify the weighting vector according to the value of order
	W <- get(paste("W", order, sep=""))
	
	##Take difference for Y and Z according to the order of X
	#counter
	j = 1
	#define variables for loop
	Y.delta <- vector()
	Z.delta <- matrix(0, (nrow(Zs) - order), ncol(Zs))
	
	#This builds the matrix of first difference observations
		for(i in (order + 1):nrow(Xs)){
				Y.delta[j] <- Ys[i] * W[1]
		for(c in 1:ncol(Zs)){
				Z.delta[j,c] <- Zs[i,c] * W[1]
		} #end for	
				j=j+1
	} #end for
	
	#Now calculate the remainder of the differencing
	for(o in 1:order){
		j = 1
		#Loop over the rows
		for(i in (order + 1):nrow(Xs)){
			Y.delta[j] <- Y.delta[j] + Ys[i - o] * W[o + 1]
			#Loop over the columns
			for(c in 1:ncol(Zs)){
					Z.delta[j,c] <- Z.delta[j,c] + Zs[i - o,c] * W[o + 1]
			} #end for columns
			j=j+1
		}	 #end for rows
	} #end for order
	#regression without intercept	
	reg <- lm(Y.delta ~  0 + Z.delta)
	print(summary(reg))
	
	#Get the coefficients from the differnce regression
	b.dif <- reg$coefficients
	
	#This estimates y.plm = Ys - Z*Bdiff which can then be plotted against X to determin f(X) (
    #NOTE THAT IT IS DEMEANED A LA THE STATA VERSION)
	y.plm <- Ys - (Zs %*% b.dif - mean(Zs %*% b.dif))
	
	
	#######################################################################
	#Graphing section
	
	if(loess){
		print("here")
		if(seq==0){
			seq <- seq(min(Xs), max(Xs), length.out=30)
		}#end if seq 
		#Use loess to estimate the nonparametric function
		low <- loess(y.plm~Xs, span=span, degree=degree, family=family)
		lowp <- predict(low, seq, se=F)
		plot(seq, lowp, type="l", ylab=y, xlab=x)
		if(se){
			#Predict sequence of loess smoothed points along X range with standard errors
			lowp <- predict(low, seq, se=T)
			lines(seq, (lowp$fit + 1.96*lowp$se), lty=2)
			lines(seq, (lowp$fit - 1.96*lowp$se), lty=2)			
		}#end if se	
	}#end if loess		
	#######################################################################
	
	#Return output values
	plm.out <- list(y.plm=y.plm, x.plm=Xs, loess.fit=low$fit, pred.fit=lowp$fit, se=lowp$se)
	return(plm.out)
}#end function

balance

#__________________________________________________________________________#
# Merlin M. Hanauer 8/23/2009                                              #
# balance function returns pre- and post-match balance information, in     #
# tabular form, and the full post-match data set of matched pairs,         #
# (optional) after Match has been run                                      #
# input:                                                                   # 
# 	dataAA-		the original dataset or covariate matrix, used to establish  #
#				pre-match balance. NOTE that if you want simple balance      #
#				measures based on the covariate matrix you used in Match,    #
#				then dataAA should use the same matrix as used in Match,     #
#				without using names. if the matched set is requested in this #
#				specifacation, then the covariats and treatment and Y's will #
#				be returned. By setting dataAA equal to your original dataset#
#				, specifying column names and setting matchout=TRUE, the     #
#				matched dataset will include all the variabls from the       #
#				original dataset.                                            #
# 	matchZ-		the object returned by Match                                 #
#	treatin- 	the treatment indicator for dataset in which Match was       # 
#							performed, should be a string                                #
#	names- 		a vector of names (optional). this should be used if you     #
#				would like to show balance information for covariates        #
#				that were not included as covariates in Match                #
#	matchout- if set to TRUE, balance will return the post match set of    # 
#				matched pairs from dataAA                                    #
#	fromMatch-indicates if matchZ is coming from Match, if just comparing  #
#				2 established data sets, set to FALSE                        # 
#	dataM- 		this should only be used if you have an expanded data set    #
#				that includes all observations from an original              # 
#		 		matching specification (in addition to the original data     #
#				set) **used for very specific case, should be=NULL           #
#   impute-   default is FALSE, tells the function to impute outcomes for  #
#				treated units according to the bias-adjustment matching      #
#				estimator                                                    #
# the names vector should be designated prior to calling balance, i.e.,    #
#	  names <- cbind("name_1", "name_2,..., "name_N")                      #
# where name_n is equal to the column name of interest from the original   #
# data set                                                                 #  
#__________________________________________________________________________#




balance <- function(dataAA, matchZ, Y=NULL, treatin, names=NULL, matchout=FALSE, fromMatch=TRUE, dataM=dataAA, impute=FALSE) {
		#create observation numbers for each data set so that a matched set can be created
		dataAA$obs <- seq(nrow(dataAA))
		dataM$obs <- seq(nrow(dataM))
		
#condition if names are not used, only balance on match covariates will be returned
if (is.null(names)) {		
						
		#assign unmatched data 
		dataA <- as.data.frame(dataAA)
					
		#assign treatment column
		treat <- dataAA[treatin] 
				
		#create matched data set
		matchset <- as.data.frame(matchZ$mdata$X) 
		tr <- as.data.frame(matchZ$mdata$Tr)
		nvars <- ncol(matchset)
				
		#create subsets of matched and unmatched data so as to calc sd
		matcht <- subset(matchset, tr == 1)
		matchc <- subset(matchset, tr == 0)
				
		tdataA <- subset(dataA, treat == 1)
		cdataA <- subset(dataA, treat == 0)
				
		#this portion is just to create a matched set
		#get observation number from matched data set for creating control weights later 
		ti <- as.data.frame(matchZ$index.treated) 
		names(ti)[1] <- "obs"
		tc <- as.data.frame(matchZ$index.control) 
		names(tc)[1] <- "obs"
		obsa <- rbind(ti, tc)
		obsa <- as.data.frame(obsa)
				
		matchall <- merge(obsa, dataA, by="obs", all.x="TRUE", sort=FALSE)
		
    #if names are used then balance measures will be returned for all variables in "names"		
}	
else if(fromMatch==TRUE) {  #note that the end if bracket and the else must be on the same line
				
		#get observation number from matched data set for creating control weights later 
		ti <- as.data.frame(matchZ$index.treated)
		#add a sequence that will identify which control observations were 
        #matched to which treated observations after the merge
		ti$match.id <- seq(nrow(ti)) 
		names(ti)[1] <- "obs"
				
		tc <- as.data.frame(matchZ$index.control) 
		#add a sequence that will identify which control observations were 
        #matched to which treated observations after the merge
		tc$match.id <- seq(nrow(tc)) 
		names(tc)[1] <- "obs"

		obsa <- as.data.frame(rbind(ti, tc))
			
		#assign treatment data frame and order NY observation number so that it will align with the covariate data
		tr <- as.data.frame(matchZ$mdata$Tr)
						
		#this will drop all observations from the original data set that are not a part of the matched data set
		matchall <- merge(obsa, dataM, by="obs", all.x=TRUE, sort=FALSE)
				
		#sort according to match.id so that the stacked data correspond to actual matches (i.e., treted observation
        #1 is matched with control observation 1 etc)
		matchall <- matchall[order(-matchall[treatin], matchall$match.id),]
					
		#create matchset
		matchset <- matchall[names]
				
		#matchsets by treatment
		matchsetat <- subset(matchall, tr == 1)
		matchsetac <- subset(matchall, tr == 0)
		matcht <- matchsetat[names]
		matchc <- matchsetac[names]
					
		#create unmatched set
		dataA <- dataAA[names]
				
		#assign treatment column
		treat <- dataAA[treatin] 
				
		#create unmatched set by treatment
		tdataA <- subset(dataA, treat == 1)
		cdataA <- subset(dataA, treat == 0)
			
		nvars <- ncol(dataA)
}
else if (fromMatch==FALSE) {			
										
		#instead of creating a data set from the output of Match, this will 
        #use the user specified data set as a Comparison
		#(matchZ) to the original data set (dataAA)
		matchall <- matchZ
		
		#create matchset
		matchset <- matchall[names]
				
		#define treatment for "matchset"
		tr <- matchall[treatin]
			
		#matchsets by treatment
		matchsetat <- subset(matchall, tr == 1)
		matchsetac <- subset(matchall, tr == 0)
		matcht <- matchsetat[names]
		matchc <- matchsetac[names]
					
		#create unmatched set
		dataA <- dataAA[names]
				
		#assign treatment column
		treat <- dataAA[treatin] 
				
		#create unmatched set by treatment
		tdataA <- subset(dataA, treat == 1)
		cdataA <- subset(dataA, treat == 0)
			
		nvars <- ncol(dataA)
} #end if
			
#now add imputed values, if specified, to the dataframe
		
if(impute == TRUE) {
	#control outcomes
	yc <- matchsetac[Y]
	matchset			
	#estimate coefficients for regression of controls units
	xc <- as.matrix(cbind(as.matrix(matchc), matrix(1, nrow(matchc), 1)))
			
	b.c <-inv(t(xc) %*% xc) %*% t(xc) %*% as.matrix(yc)
			
	yhat <- as.data.frame(as.matrix(cbind(matchall[names], matrix(1, nrow(matchall), 1))) %*% b.c)
	names(yhat)[1] <- "yhat"
	matchall <- data.frame( matchall, yhat)
	}
			
			
			
			
	#dim all the variable to be used in loop
	balu <- list()
	balm <- list()
	muT <- 1
	muC <- 1
	mdifu <- 1
	eqqu <- 1
	mmT <- 1
	mmC <- 1
	mdifm <- 1
	eqqm <- 1
	pctimp <- 1
	meanT <- vector()
	meanC <- vector()
	difmean <- vector()
	eqqdif <- vector()
	pctimpO <- vector()
	normdifu <- vector()
	normdifm <- vector()
	normdif <- vector()
	pctimprove <- vector()
	names2 <- vector()
	mu <- vector()
			
	#loop over each variable to calculate and compile stats
	for (i in 1:nvars) {
					
			#calls Sekhon's univariate balance function for unmatched variables
			balu <- balanceUV(dataA[, i][treat ==1], dataA[, i][treat != 1])
			
			#means unmatched// note store with [[]] call with []
			muT[[i]] <- balu$mean.Tr
			muC[[i]] <- balu$mean.Co
					
			#mean dif unmatched
			mdifu[[i]] <- muT[[i]] - muC[[i]]
			#raw mean Esq from balanceUV
			eqqu[[i]] <- balu$qqsummary.raw$meandiff
			#normalized difference for unmatched, as suggested by Imbens
			normdifu[i] <- abs(mdifu[i] / sqrt(sd(tdataA[, i])^2 + sd(cdataA[, i]^2))) 
					
			#calls Sekhon's univariate balance function for animated variables
			balm <- balanceUV(matchset[, i][tr ==1], matchset[, i][tr != 1])
					
			#means matched set
			mmT[[i]] <- balm$mean.Tr
			mmC[[i]] <- balm$mean.Co
					
			#mean dif matched
			mdifm[[i]] <- mmT[[i]] - mmC[[i]]
			#Esq for matched
			eqqm[[i]] <- balm$qqsummary.raw$meandiff
			#normdif for matched
			normdifm[i] <- abs(mdifm[i] / sqrt(sd(matcht[, i])^2 + sd(matchc[, i]^2)))
					
			#pct meandif imp
			pctimp[[i]] <- (abs(mdifu[[i]]) - abs(mdifm[[i]])) / abs(mdifu[[i]])
					
			#this combines "stacks" all stats for each loop
			meanT <- rbind(meanT, muT[i], mmT[i])
			meanC <- rbind(meanC, muC[i], mmC[i])
			difmean <- rbind(difmean, mdifu[i], mdifm[i])
			normdif <- rbind(normdif, normdifu[i], normdifm[i])
			eqqdif <- rbind(eqqdif, eqqu[i], eqqm[i])
			pctimprove <- rbind(pctimprove, 0, pctimp[i])
					
			if (is.null(names)) {
					names2 <- rbind(names2, names(dataA)[i], names(dataA)[i]) 
			} else {
					names2 <- rbind(names2, names[i], names[i]) 
			}
					
			mu <- rbind(mu, "Unmatched", "Matched")
	}
			
	#set output
	#condition, if the user specifies the return of a matched set
	if (matchout) {
			match.set <- as.data.frame(matchall)
			bal <- data.frame(names2, mu, meanT, meanC, difmean, normdif, eqqdif, pctimprove)
			out <- list(bal=bal, match.set=match.set)
	}	else {
			out <- data.frame(names2, mu, meanT, meanC, difmean, normdif, eqqdif, pctimprove)
	}		
	return(out)
}

countif

#Function: countif

#####################################################################################
# 																																									#
# Function evaluates a logical expression and returns the number of elements that   #
# meet and do not meet the condition. Convenient for complex expressions. Function  #
# also will return a data frame with the elements of the original data that meet    #
# the specified condition.                                                          #
#                                                                                   #
# Arguments:                                                                        #
#   exp  = Logical expression to be evaluated. Can be specified as string (most   #
#	    	 convenient) or in the form data$column.                                #
#                                                                                   #
#	data = Data on which exp will be evaluated. If data=NULL then the most        #
#			 recent attached (see attach()) data will be used to evaluate exp.      #
#                                                                                   #
#	store= Logical, indicate whether or not elements of data that meet condition  #
#			 exp should be stored in a new object.	                                #
#                                                                                   #
#####################################################################################

countif <- function(exp=expression, data=NULL, store=FALSE){
	
	#if no data are specified then use the attached data
	if(is.null(data)){
		cat("Data was not specified, so using data that was attached last", "\n")
		s <- search()
		data <- get(s[2])
	}#end if(is.null(data))
	
	#evaluate depending on the type of input for "exp"
	if(typeof(exp) == "character"){
		temp <- subset(data, eval(parse(text=exp)))
	} else {
		storage.mode(exp) <- "logical"
		temp <- subset(data, exp)
	}#end if/else(typeof)
	
	#print results
	cat("Count  True = ", nrow(temp), "\n")
	cat("Count False = ", nrow(data) - nrow(temp), "\n")
	
	#if called, store data that meets the condition "exp" for return
	if(store){
		cond.data <- as.data.frame(temp)
		return(cond.data)
	}#end if(store)
}

Contact SBE
Contact SSU
Sonoma State University School of Business and Economics. Reach.
1801 East Cotati Avenue, Rohnert Park, CA, 94928 • 707-664-2377