### R code to accompany paper: # ### Atkins, D. C., & Gallop, R. J. (2007). Re-thinking how family researchers ### model infrequent outcomes: A tutorial on count regression and zero-inflated ### models. Journal of Family Psychology. # ### We assume that you have downloaded an installed R (www.r-project.org). ### Commands below can be pasted into R from any text editor (e.g., MSWord), ### though Tinn-R is a convenient text editor with features to send commands ### to R and color-coded highlighting of R commands; it can be downloaded here: # ### http://www.sciviews.org/Tinn-R/ # ### The analyses below use several user-contributed libraries; if you have R ### installed and have an internet connection, the necessary libraries should ### be installed via: install.packages("pscl") install.packages("car") ### To see all available packages (which is 707 as of April 2006): available.packages() ### Descriptions of user-contributed packages can be found at the R website ### (follow the links to "Download" and then to "Contributed extension packages" ### To read in the example data: couple.df <- read.table(file = file.choose(), header = TRUE) ### Summary of data summary(couple.df) ### gender and infidelity are categorical and should be specified as such to R couple.df$gender <- factor(x = couple.df$gender, levels = 1:2, labels = c("Wife","Husband")) couple.df$infidelity <- factor(x = couple.df$infidelity, levels = 0:1, labels = c("No infidelity","Infidelity")) ### Histogram of DV hist(couple.df$msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce", ylab = "Probability", main = "") ### Figure 2 pois.0.5 <- dpois(0:12, lambda=0.5) pois.1.5 <- dpois(0:12, lambda=1.5) pois.3 <- dpois(0:12, lambda=3) pois.6 <- dpois(0:12, lambda=6) par(mfrow=c(2,2)) plot(pois.0.5, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)", main = expression(paste("Poisson Distribution: ", italic(mu), "= 0.5", sep=" "))) plot(pois.1.5, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)", main = expression(paste("Poisson Distribution: ", italic(mu), " = 1.5", sep=" "))) plot(pois.3, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)", main = expression(paste("Poisson Distribution: ", italic(mu), " = 3", sep=" "))) plot(pois.6, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)", main = expression(paste("Poisson Distribution: ", italic(mu), " = 6", sep=" "))) ### Center predictors prior to regression couple.df$das.c <- with(couple.df, das - mean(das)) couple.df$afc.c <- with(couple.df, afc - mean(afc)) couple.df$sex.c <- with(couple.df, sex - mean(sex)) ### Check contrasts for binary predictors (R creates contrasts automatically ### for categorical predictor, with default 0/1 coding) with(couple.df, { print(contrasts(gender)) print(contrasts(infidelity)) }) ### Fit OLS regression msi.lm <- lm(msi ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df) summary(msi.lm) ### default plots to check assumptions par(mfrow=c(3,2)) plot(msi.lm, which=1:6) ### Fit OLS regression with log-transformed outcome msi.lm2 <- lm(log(msi + 0.5) ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df) summary(msi.lm2) ### default plots to check assumptions par(mfrow=c(3,2)) plot(msi.lm2, which=1:6) # NOTE: Assumptions still don't look that great ### Fit poisson regression msi.glm <- glm(msi ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df, family = poisson) summary(msi.glm, cor = TRUE) confint(msi.glm, level = 0.95) ### Analysis of deviance table anova(msi.glm, test = "Chisq") ### Table 1 (extract first 3 columns of coefficients table and round for ### both Poisson and NB models) # ### NOTE: need to fit negative binomial regression first (below) cbind(round(summary(msi.glm)$coefficients[,1:3], 3), round(summary(msi.glm.nb)$coefficients[,1:3], 3)) ### interpreting coefficients exp(coef(msi.glm)) # raise all to power of e 100*(exp(coef(msi.glm)["afc.c"]) - 1) # percent change for afc ### get SD apply(couple.df, 2, sd, na.rm=TRUE) 100*(exp(coef(msi.glm)["afc.c"]*6.85) - 1) # percent change for 1 SD of afc ### Get predictions and plot based on regression equation # ### To understand some of the code below, see: help(expand.grid) help(predict.glm) with(couple.df, quantile(das.c, probs = c(0.025,0.975))) newdata <- expand.grid(gender = "Wife", infidelity = c("No infidelity","Infidelity"), sex.c = 0, afc.c = 0, das.c = -32:24) ### see what newdata looks like newdata ### Generate predictions from Poisson regression yhat <- predict(msi.glm, newdata = newdata, se.fit = TRUE, type = "response") ### To do the same with ZIP/ZINB, use the following after loading pscl library # yhat <- predict(msi.glm2, newdata = newdata, # se.fit = TRUE, MC = 2500, conf = .95) ### Get confidence bands newdata$yhat <- yhat$fit newdata$lower <- yhat$fit - 1.96*yhat$se.fit newdata$upper <- yhat$fit + 1.96*yhat$se.fit ### Confidence bands for ZIP/ZINB # newdata$yhat <- yhat$yhat # newdata$lower <- yhat$lower # newdata$upper <- yhat$upper ## Plotting 95% confidence bands - Figure 3 plot(yhat ~ das.c, data = newdata, xlab = "Dyadic Adjustment Scale", ylab = "Predicted Counts", ylim = c(0, max(newdata$upper)), type = "n") with(subset(newdata, infidelity == "Infidelity"), { polygon(x = c(das.c, rev(das.c)), y = c(lower, rev(upper)), border = FALSE, col = gray(.75)) }) with(subset(newdata, infidelity == "No infidelity"), { polygon(x = c(das.c, rev(das.c)), y = c(lower, rev(upper)), border = FALSE, col = gray(.75)) }) with(subset(newdata, infidelity == "Infidelity"), { # lines need to go last so that lines(x = das.c, # second polygon doesn't obscure y = yhat, # the first regression line lwd = 2) }) with(subset(newdata, infidelity == "No infidelity"), { lines(x = das.c, y = yhat, lwd = 2, lty = 2) }) legend(-28, 2.5, c("Infidelity","No infidelity"), lty=c(1,2), lwd=2) ### Get predictions at specific values for AFC with(couple.df, quantile(afc.c, c(0.025, 0.975))) predict(msi.glm, newdata = expand.grid(das.c = 0, sex.c = 0, infidelity = "No infidelity", gender = "Wife", afc.c = c(-14.3, 12.6)), type = "response") ### Diagnostics (some use John Fox's library - car) library(car) ### check variance inflation factors vif(msi.glm) # multicollinearity not a problem ### Plot Cook's D plot(cookd(msi.glm)) identify(1:nrow(couple.df), cookd(msi.glm)) # to identify points, hit Esc when done ### Not much to be concerned about, 134 has highest Cook's D couple.df[134,] # low DAS with infidelity, but MSI = 0 ### All in one influence plot a la Fox influencePlot(msi.glm) # help(influence.plot) # to see what all is reported ### Finally, let's look at DFBETAs msi.dfb <- dfbeta(msi.glm) msi.dfb[1:10,] # see first 10 rows names(msi.dfb) ### plot par(mfrow=c(2,4)) for (i in 1:7){ plot(msi.dfb[,i], ylab = paste("DFBETA",colnames(msi.dfb)[i])) } ### Most notable on infidelity and interaction plot(msi.dfb[,3]) identify(1:nrow(couple.df), msi.dfb[,3]) # 127 and 71 have largest couple.df[127,] ; couple.df[71,] # both husbands with affairs but not too distressed ### Compare model excluding them with original model summary(update(msi.glm, subset = -c(127, 71))) summary(msi.glm) ### NOTE: Coefficients shift slightly but same conclusions ### Use Simon Jackman's pscl library library(pscl) ### Get predicted count probabilities from Poisson regression phat.pois <- predprob(msi.glm) # for each subj, prob of observing each value phat.pois.mn <- apply(phat.pois, 2, mean) # mean predicted probs ### Histogram plot with fitted probabilities with(couple.df, { hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce", main = "Histogram of MSI with Unconditional \n and Conditional Poisson Densities") lines(x = seq(0, 11, 1), y = dpois(0:11, lambda = mean(msi, na.rm=T)), type = "b", lwd=2, lty=1) lines(x = seq(0, 11, 1), y = phat.pois.mn, type = "b", lwd=2, col="red") }) legend(7, 0.23, c("Unconditional","Conditional"), lty=1, col = c("black","red"), lwd=2) ### Not a great fit due to over-dispersion, look at mean-var relationship with(couple.df, { print(mean(msi, na.rm=T)) print(var(msi, na.rm=T)) # over 2x larger than mean }) ######################## Negative Binomial #################################### # ### Negative Binomial regression model is in MASS library library(MASS) msi.glm.nb <- glm.nb(msi ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df) summary(msi.glm.nb, correlation = FALSE) confint(msi.glm.nb, level = 0.95) ### Test of dispersion parameter using asymptotic SE pnorm(1 - summary(msi.glm.nb)$theta/summary(msi.glm.nb)$SE.theta, lower=TRUE) ### Deviance test anova(msi.glm, msi.glm.nb) # NOTE: Doesn't count theta in DFs ### Over-dispersion test from pscl library odTest(msi.glm.nb) ### To compare with Poisson summary(msi.glm) ; summary(msi.glm.nb, correlation = FALSE) ### Ratio of SE for two models, approximately sqrt(theta) sqrt(diag(vcov(msi.glm.nb)))/sqrt(diag(vcov(msi.glm))) ### Get predicted count probabilities for NB model phat.nb <- predprob(msi.glm.nb) # for each subj, prob of observing each value phat.nb.mn <- apply(phat.nb, 2, mean) # mean predicted probs ### redo earlier plot with fitted probabilities with(couple.df, { hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce", main = "Histogram of MSI with Conditional Poisson \n and Negative Binomial Densities") lines(x = seq(0, 11, 1), y = phat.pois.mn, type = "b", lwd=2, col="red") lines(x = seq(0, 11, 1), y = phat.nb.mn, type = "b", lwd=2, col="blue") }) legend(7, 0.23, c("Poisson","NB"), lty=1, col = c("red","blue"), lwd=2) ####################### ZIP Model ############################################## # ### ZIP and ZINB models in pscl library; also, see zicounts library for another ### zero-inflated possibility library(pscl) ### NOTE: the model statement before the pipe "|" are the count predictors and ### after the pipe are the logistic predictors; these can be different msi.zip <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c | das.c*infidelity + gender + afc.c + sex.c, data = couple.df, link = "logit", dist = "poisson", trace = TRUE, EM = TRUE) summary(msi.zip) confint(msi.zip) ### Interpret logistic coefficients on odds-ratio scale exp(coef(msi.zip)[8:14]) ### Interpret poisson coefficients on percent scale 100*(exp(coef(msi.zip)[1:7]) - 1) # 1st - 7th coefficients 100*(exp(coef(msi.zip)[2]*14.4) - 1) # 2nd coefficient (DAS) ### for ZIP model phat.zip <- predprob(msi.zip) # for each subj, prob of observing each value phat.zip.mn <- apply(phat.zip, 2, mean) # mean predicted probs ####################### ZINB Model ############################################## # msi.zinb <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c | das.c*infidelity + gender + afc.c + sex.c, data = couple.df, link = "logit", dist = "negbin", trace = TRUE, EM = TRUE) summary(msi.zinb) confint(msi.zinb) ### deviance test 1 - pchisq(2*(msi.zip$loglik - msi.zinb$loglik), df=1) ### for ZINB model phat.zinb <- predprob(msi.zinb) # for each subj, prob of observing each value phat.zinb.mn <- apply(phat.zinb, 2, mean) # mean predicted probs ### redo earlier plot with fitted probabilities with(couple.df, { hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce", main = "Histogram of MSI with Conditional ZIP and ZINB Densities") lines(x = seq(0, 11, 1), y = phat.zip.mn, type = "b", lwd=2, col="red") lines(x = seq(0, 11, 1), y = phat.zinb.mn, type = "b", lwd=2, col="blue") }) legend(7, 0.23, c("ZIP","ZINB"), lty=1, col = c("red","blue"), lwd=2) ### Vuong test in pscl library vuong(msi.glm.nb, msi.zip) # strongly prefers zip vuong(msi.glm.nb, msi.zinb) # strongly prefers zinb vuong(msi.zip, msi.zinb) # does *not* prefer zinb #################### Compare Models by Plotting Residual Probabilities ######### # ### we didn't have enough space to leave this in the ms. see scott long's book ### for details # ### get original densities and predicted probs in a single df pred.df <- data.frame(original = table(couple.df$msi)/nrow(couple.df), pois = phat.pois.mn, nb = phat.nb.mn, zip = phat.zip.mn, zinb = phat.zinb.mn) pred.df ### get residual probabilities pred.resid <- pred.df[,3:6] - pred.df[,2] ### plot matplot(pred.resid, type = "b", lwd=2, pch = 1:4, ylab = "Predicted - Observed", xlab = "Counts") #NOTE: actually counts + 1 abline(h=0) legend(10, .08, c("POIS","NB","ZIP","ZINB"), lty = 1:4, lwd=2, cex = 1, col=1:4, pch=1:4) ################## Generalized Linear Mixed Model Poisson Regression ########### library(lme4) msi.lmer <- lmer(msi ~ das.c*infidelity + gender + afc.c + sex.c + (1|id), data = couple.df, family = poisson, method = "Laplace", na.action = na.omit, control = list(EMverbose = TRUE, msVerbose = TRUE)) summary(msi.lmer) ### NOTE: at the time of this writing (5/9/07), the lme4 package is actively ### in development. ######################## Bootstrap Fitting of ZIP/ZINB Models ############### # ### bootstrap functions are in boot library library(boot) ### need function that takes a data argument and indices, which in our case will ### sample rows of data matrix; finally, it should return vector of regression ### coefficients boot.zinb <- function(data, indices){ require(pscl) # make sure zeroinfl() is available data <- data[indices,] # select obs. in bootstrap sample try(mod <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c | das.c*infidelity + gender + afc.c + sex.c, data = data, # note: use "data" not original data.frame link = "logit", dist = "negbin", trace = FALSE, # turn off printing iterations EM = TRUE)) if (exists("mod")) { coef(mod) } else { NA } # return coefficient vector } system.time(zinb.boot.out <- boot(data = couple.df, statistic = boot.zinb, R = 99)) ### NOTE: just over 10 minutes on 2.2 Ghz CPU with 760 MB RAM for 99 iterations ### NOTE: just under 60 minutes on 3.6 Ghz CPU with 2 GB RAM for 999 iterations zinb.boot.out # basic output summary(msi.zinb) # compare with original coefficients and SE ### for histogram and QQplot for a given component plot(zinb.boot.out, index = 1) ### histograms of all components par(mfrow=c(3,5)) for (i in 1:14) { hist(zinb.boot.out$t[,i], col = "yellow", main = "", xlab = names(coef(msi.zinb))[i]) abline(v = zinb.boot.out$t0[i], lwd=2) # add reference line at original value } ########################### ZINB Mixed Models ################################## # ### the following code uses the functions by Drs. Andy Lee and Kelvin Yau ### to fit a zero-inflated negative-binomial regression with up two (possible) ### random-effects: 1) in the logistic portion of the model, and 2) in the ### counts portion of the model. # ### first, source in the R code from the file "R - Multilevel ZINB.R" source("R - Multilevel ZINB.txt") ### the sumzinbmix function requires the following arguments: # ### y = vector with outcome variable ### x.l = matrix of predictors for logistic portion ### x.nb = matrix of predictors for negative-binomial portion ### random = vector with grouping variable ### r.l = TRUE/FALSE: random-effect in logistic portion ### r.nb = TRUE/FALSE: random-effect in negative-binomial portion ### use same predictors in both portions; model.matrix() function will ### allow us to use formula interface to generate predictor matrix, ### though we need to pull out the intercept x.mat <- model.matrix(~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df)[,-1] ### you might wish to turn off buffered output (/Misc/Buffered Output) ### so that you can see the iterations in real-time # ### model with no random-effects msi.zinbmix <- sumzinbmix(y = couple.df$msi, x.nb = x.mat, x.l = x.mat, random = couple.df$id, r.nb = FALSE, r.l = FALSE) ### random-intercept logistic msi.zinbmix2 <- sumzinbmix(y = couple.df$msi, x.nb = x.mat, x.l = x.mat, random = couple.df$id, r.nb = FALSE, r.l = TRUE) ### random-intercept both msi.zinbmix3 <- sumzinbmix(y = couple.df$msi, x.nb = x.mat, x.l = x.mat, random = couple.df$id, r.nb = TRUE, r.l = TRUE) ### examine log-likelihoods msi.zinbmix$loglik ; msi.zinbmix2$loglik ; msi.zinbmix3$loglik ### deviance test for two effect model compared to no random-effects 1 - pchisq(2*(msi.zinbmix3$loglik - msi.zinbmix$loglik), df=2) ### coefficients from final model zinb.coef(msi.zinbmix3) ### components of fitted object str(msi.zinbmix3) ### random-effects variance for logistic msi.zinbmix3$sigu ### random-effects variance for NB msi.zinbmix3$sigv ### BLUPs of random-effects for logistic msi.zinbmix3$risku ### QQplot and histogram par(mfrow = c(1,2)) qqnorm(msi.zinbmix3$risku) qqline(msi.zinbmix3$risku) hist(msi.zinbmix3$risku, col = "yellow") ### NOTE: distributions skewed, which violates normality assumption ### BLUPs of random-effects for NB msi.zinbmix3$riskv ### QQplot and histogram par(mfrow = c(1,2)) qqnorm(msi.zinbmix3$riskv) qqline(msi.zinbmix3$riskv) hist(msi.zinbmix3$riskv, col = "yellow") ### NOTE: reasonably normally distributed