### R code to accompany paper:
#
### Atkins, D. C., & Gallop, R. J. (2007). Re-thinking how family researchers 
### model infrequent outcomes: A tutorial on count regression and zero-inflated 
### models.  Journal of Family Psychology.
#
### We assume that you have downloaded an installed R (www.r-project.org).
### Commands below can be pasted into R from any text editor (e.g., MSWord),
### though Tinn-R is a convenient text editor with features to send commands
### to R and color-coded highlighting of R commands; it can be downloaded here:
#
### http://www.sciviews.org/Tinn-R/
#
### The analyses below use several user-contributed libraries; if you have R
### installed and have an internet connection, the necessary libraries should
### be installed via:
install.packages("pscl")
install.packages("car")

### To see all available packages (which is 707 as of April 2006):
available.packages()

### Descriptions of user-contributed packages can be found at the R website
### (follow the links to "Download" and then to "Contributed extension packages"

### To read in the example data:
couple.df <- read.table(file = file.choose(), header = TRUE)

### Summary of data
summary(couple.df)

### gender and infidelity are categorical and should be specified as such to R
couple.df$gender <- factor(x = couple.df$gender, 
                          levels = 1:2, labels = c("Wife","Husband"))
couple.df$infidelity <- factor(x = couple.df$infidelity, 
                          levels = 0:1, labels = c("No infidelity","Infidelity"))
                          
### Histogram of DV
hist(couple.df$msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), 
  xlab = "Steps taken toward divorce", ylab = "Probability", main = "")

### Figure 2
pois.0.5 <- dpois(0:12, lambda=0.5)
pois.1.5 <- dpois(0:12, lambda=1.5)
pois.3 <- dpois(0:12, lambda=3)
pois.6 <- dpois(0:12, lambda=6)

par(mfrow=c(2,2))
plot(pois.0.5, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)",
    main = expression(paste("Poisson Distribution: ", italic(mu), "= 0.5", sep=" ")))
plot(pois.1.5, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)",
    main = expression(paste("Poisson Distribution: ", italic(mu), " = 1.5", sep=" ")))
plot(pois.3, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)",
    main = expression(paste("Poisson Distribution: ", italic(mu), " = 3", sep=" ")))
plot(pois.6, type = "b", lwd=2, xlab = "Y", ylab = "Pr(Y)",
    main = expression(paste("Poisson Distribution: ", italic(mu), " = 6", sep=" ")))

### Center predictors prior to regression
couple.df$das.c <- with(couple.df, das - mean(das))
couple.df$afc.c <- with(couple.df, afc - mean(afc))
couple.df$sex.c <- with(couple.df, sex - mean(sex))

### Check contrasts for binary predictors (R creates contrasts automatically
### for categorical predictor, with default 0/1 coding)
with(couple.df, {
  print(contrasts(gender))
  print(contrasts(infidelity))
  })
  
### Fit OLS regression 
msi.lm <- lm(msi ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df)
summary(msi.lm)

### default plots to check assumptions
par(mfrow=c(3,2))
plot(msi.lm, which=1:6)

### Fit OLS regression with log-transformed outcome
msi.lm2 <- lm(log(msi + 0.5) ~ das.c*infidelity + gender + afc.c + sex.c, data = couple.df)
summary(msi.lm2)

### default plots to check assumptions
par(mfrow=c(3,2))
plot(msi.lm2, which=1:6)  # NOTE: Assumptions still don't look that great

### Fit poisson regression
msi.glm <- glm(msi ~ das.c*infidelity + gender + afc.c + sex.c, 
                  data = couple.df,
                  family = poisson)
summary(msi.glm, cor = TRUE)
confint(msi.glm, level = 0.95)

### Analysis of deviance table
anova(msi.glm, test = "Chisq")

### Table 1 (extract first 3 columns of coefficients table and round for 
### both Poisson and NB models)
#
### NOTE: need to fit negative binomial regression first (below)
cbind(round(summary(msi.glm)$coefficients[,1:3], 3),
      round(summary(msi.glm.nb)$coefficients[,1:3], 3))

### interpreting coefficients
exp(coef(msi.glm))                          # raise all to power of e
100*(exp(coef(msi.glm)["afc.c"]) - 1)       # percent change for afc

### get SD
apply(couple.df, 2, sd, na.rm=TRUE)
100*(exp(coef(msi.glm)["afc.c"]*6.85) - 1)  # percent change for 1 SD of afc

### Get predictions and plot based on regression equation
#
### To understand some of the code below, see:
help(expand.grid)
help(predict.glm)

with(couple.df, quantile(das.c, probs = c(0.025,0.975)))

newdata <- expand.grid(gender = "Wife",
                       infidelity = c("No infidelity","Infidelity"),
                       sex.c = 0, 
                       afc.c = 0,
                       das.c = -32:24)
### see what newdata looks like
newdata
                       
### Generate predictions from Poisson regression
yhat <- predict(msi.glm, 
                newdata = newdata, 
                se.fit = TRUE, 
                type = "response")

### To do the same with ZIP/ZINB, use the following after loading pscl library
# yhat <- predict(msi.glm2, newdata = newdata,
#                se.fit = TRUE, MC = 2500, conf = .95) 

### Get confidence bands
newdata$yhat <- yhat$fit
newdata$lower <- yhat$fit - 1.96*yhat$se.fit
newdata$upper <- yhat$fit + 1.96*yhat$se.fit

### Confidence bands for ZIP/ZINB
# newdata$yhat <- yhat$yhat
# newdata$lower <- yhat$lower
# newdata$upper <- yhat$upper

## Plotting 95% confidence bands - Figure 3
plot(yhat ~ das.c,
     data = newdata,
     xlab = "Dyadic Adjustment Scale",
     ylab = "Predicted Counts",
     ylim = c(0, max(newdata$upper)),
     type = "n")
with(subset(newdata, infidelity == "Infidelity"), {
        polygon(x = c(das.c, rev(das.c)),
          y = c(lower, rev(upper)),
          border = FALSE,
          col = gray(.75))
})          
with(subset(newdata, infidelity == "No infidelity"), {
        polygon(x = c(das.c, rev(das.c)),
          y = c(lower, rev(upper)),
          border = FALSE,
          col = gray(.75))
})          
with(subset(newdata, infidelity == "Infidelity"), { # lines need to go last so that
        lines(x = das.c,                            # second polygon doesn't obscure
          y = yhat,                                 # the first regression line
          lwd = 2)
          })
with(subset(newdata, infidelity == "No infidelity"), { 
        lines(x = das.c,                      
          y = yhat,                         
          lwd = 2, lty = 2)
          })
legend(-28, 2.5, c("Infidelity","No infidelity"), lty=c(1,2), lwd=2)

### Get predictions at specific values for AFC
with(couple.df, quantile(afc.c, c(0.025, 0.975)))
predict(msi.glm, 
          newdata = expand.grid(das.c = 0, 
                                sex.c = 0, 
                                infidelity = "No infidelity",
                                gender = "Wife", 
                                afc.c = c(-14.3, 12.6)),
          type = "response")

### Diagnostics (some use John Fox's library - car)
library(car)

### check variance inflation factors
vif(msi.glm) # multicollinearity not a problem
  
### Plot Cook's D
plot(cookd(msi.glm))
identify(1:nrow(couple.df), cookd(msi.glm)) # to identify points, hit Esc when done

### Not much to be concerned about, 134 has highest Cook's D
couple.df[134,]  # low DAS with infidelity, but MSI = 0

### All in one influence plot a la Fox
influencePlot(msi.glm) #
help(influence.plot)    # to see what all is reported

### Finally, let's look at DFBETAs
msi.dfb <- dfbeta(msi.glm)
msi.dfb[1:10,] # see first 10 rows
names(msi.dfb)

### plot
par(mfrow=c(2,4))
for (i in 1:7){
  plot(msi.dfb[,i], ylab = paste("DFBETA",colnames(msi.dfb)[i]))
  }

### Most notable on infidelity and interaction
plot(msi.dfb[,3])
identify(1:nrow(couple.df), msi.dfb[,3]) # 127 and 71 have largest

couple.df[127,] ; couple.df[71,] # both husbands with affairs but not too distressed

### Compare model excluding them with original model
summary(update(msi.glm, subset = -c(127, 71)))
summary(msi.glm)

### NOTE: Coefficients shift slightly but same conclusions

### Use Simon Jackman's pscl library
library(pscl)

### Get predicted count probabilities from Poisson regression
phat.pois <- predprob(msi.glm)            # for each subj, prob of observing each value
phat.pois.mn <- apply(phat.pois, 2, mean) # mean predicted probs

### Histogram plot with fitted probabilities
with(couple.df, {
  hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce",
     main = "Histogram of MSI with Unconditional \n and Conditional Poisson Densities")
  lines(x = seq(0, 11, 1), y = dpois(0:11, lambda = mean(msi, na.rm=T)), 
      type = "b", lwd=2, lty=1)
  lines(x = seq(0, 11, 1), y = phat.pois.mn, type = "b", lwd=2, col="red")
})
legend(7, 0.23, c("Unconditional","Conditional"), lty=1, col = c("black","red"), lwd=2)

### Not a great fit due to over-dispersion, look at mean-var relationship
with(couple.df, {
    print(mean(msi, na.rm=T))
    print(var(msi, na.rm=T))  # over 2x larger than mean
    })
    
######################## Negative Binomial ####################################
#
### Negative Binomial regression model is in MASS library
library(MASS)
msi.glm.nb <- glm.nb(msi ~ das.c*infidelity + gender + afc.c + sex.c, 
                        data = couple.df)
summary(msi.glm.nb, correlation = FALSE)
confint(msi.glm.nb, level = 0.95)

### Test of dispersion parameter using asymptotic SE
pnorm(1 - summary(msi.glm.nb)$theta/summary(msi.glm.nb)$SE.theta, lower=TRUE)

### Deviance test
anova(msi.glm, msi.glm.nb) # NOTE: Doesn't count theta in DFs

### Over-dispersion test from pscl library
odTest(msi.glm.nb)

### To compare with Poisson
summary(msi.glm) ; summary(msi.glm.nb, correlation = FALSE)

### Ratio of SE for two models, approximately sqrt(theta)
sqrt(diag(vcov(msi.glm.nb)))/sqrt(diag(vcov(msi.glm)))

### Get predicted count probabilities for NB model
phat.nb <- predprob(msi.glm.nb)       # for each subj, prob of observing each value
phat.nb.mn <- apply(phat.nb, 2, mean) # mean predicted probs

### redo earlier plot with fitted probabilities
with(couple.df, {
  hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce",
     main = "Histogram of MSI with Conditional Poisson \n and Negative Binomial Densities")
  lines(x = seq(0, 11, 1), y = phat.pois.mn, type = "b", lwd=2, col="red")
  lines(x = seq(0, 11, 1), y = phat.nb.mn, type = "b", lwd=2, col="blue")
})
legend(7, 0.23, c("Poisson","NB"), lty=1, col = c("red","blue"), lwd=2)

####################### ZIP Model ##############################################
#
### ZIP and ZINB models in pscl library; also, see zicounts library for another
### zero-inflated possibility
library(pscl)

### NOTE: the model statement before the pipe "|" are the count predictors and
###       after the pipe are the logistic predictors; these can be different
msi.zip <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c |
                    das.c*infidelity + gender + afc.c + sex.c,
                data = couple.df,
                link = "logit",
                dist = "poisson",
                trace = TRUE, EM = TRUE)
summary(msi.zip)
confint(msi.zip)

### Interpret logistic coefficients on odds-ratio scale
exp(coef(msi.zip)[8:14])

### Interpret poisson coefficients on percent scale
100*(exp(coef(msi.zip)[1:7]) - 1)   # 1st - 7th coefficients
100*(exp(coef(msi.zip)[2]*14.4) - 1) # 2nd coefficient (DAS)

### for ZIP model
phat.zip <- predprob(msi.zip)           # for each subj, prob of observing each value
phat.zip.mn <- apply(phat.zip, 2, mean) # mean predicted probs

####################### ZINB Model ##############################################
#
msi.zinb <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c |
                    das.c*infidelity + gender + afc.c + sex.c,
                    data = couple.df,
                    link = "logit",
                    dist = "negbin",
                    trace = TRUE, EM = TRUE)
summary(msi.zinb)
confint(msi.zinb)

### deviance test
1 - pchisq(2*(msi.zip$loglik - msi.zinb$loglik), df=1)

### for ZINB model
phat.zinb <- predprob(msi.zinb)           # for each subj, prob of observing each value
phat.zinb.mn <- apply(phat.zinb, 2, mean) # mean predicted probs

### redo earlier plot with fitted probabilities
with(couple.df, {
  hist(msi, prob = TRUE, col = "yellow", breaks=seq(-0.5, 11.5, 1), xlab = "Steps taken toward divorce",
     main = "Histogram of MSI with Conditional ZIP and ZINB Densities")
  lines(x = seq(0, 11, 1), y = phat.zip.mn, type = "b", lwd=2, col="red")
  lines(x = seq(0, 11, 1), y = phat.zinb.mn, type = "b", lwd=2, col="blue")
})
legend(7, 0.23, c("ZIP","ZINB"), lty=1, col = c("red","blue"), lwd=2)

### Vuong test in pscl library
vuong(msi.glm.nb, msi.zip)  # strongly prefers zip
vuong(msi.glm.nb, msi.zinb) # strongly prefers zinb
vuong(msi.zip, msi.zinb)    # does *not* prefer zinb

#################### Compare Models by Plotting Residual Probabilities #########
#
### we didn't have enough space to leave this in the ms.  see scott long's book
### for details
#
### get original densities and predicted probs in a single df
pred.df <- data.frame(original = table(couple.df$msi)/nrow(couple.df),
                    pois = phat.pois.mn, 
                    nb = phat.nb.mn,
                    zip = phat.zip.mn,
                    zinb = phat.zinb.mn)
pred.df

### get residual probabilities
pred.resid <- pred.df[,3:6] - pred.df[,2]

### plot
matplot(pred.resid, type = "b", lwd=2, pch = 1:4,
        ylab = "Predicted - Observed", xlab = "Counts") #NOTE: actually counts + 1
abline(h=0)
legend(10, .08, c("POIS","NB","ZIP","ZINB"), lty = 1:4, lwd=2, cex = 1, col=1:4, pch=1:4) 

################## Generalized Linear Mixed Model Poisson Regression ###########
library(lme4)

msi.lmer <- lmer(msi ~ das.c*infidelity + gender + afc.c + sex.c + (1|id), 
                        data = couple.df,
                        family = poisson,
                        method = "Laplace",
                        na.action = na.omit,
                        control = list(EMverbose = TRUE, msVerbose = TRUE))
summary(msi.lmer)

### NOTE: at the time of this writing (5/9/07), the lme4 package is actively
###       in development.
 
######################## Bootstrap Fitting of ZIP/ZINB Models ###############
#
### bootstrap functions are in boot library
library(boot)

### need function that takes a data argument and indices, which in our case will
### sample rows of data matrix; finally, it should return vector of regression
### coefficients
boot.zinb <- function(data, indices){
                  require(pscl)               # make sure zeroinfl() is available
                  data <- data[indices,]      # select obs. in bootstrap sample
                  try(mod <- zeroinfl(msi ~ das.c*infidelity + gender + afc.c + sex.c |
                            das.c*infidelity + gender + afc.c + sex.c,
                            data = data,      # note: use "data" not original data.frame
                            link = "logit",
                            dist = "negbin",
                            trace = FALSE,    # turn off printing iterations
                            EM = TRUE))
                  if (exists("mod")) {
                    coef(mod)
                    } else {
                    NA
                    }                   # return coefficient vector
              }

system.time(zinb.boot.out <- boot(data = couple.df, statistic = boot.zinb, R = 99))

### NOTE: just over 10 minutes on 2.2 Ghz CPU with 760 MB RAM for 99 iterations
### NOTE: just under 60 minutes on 3.6 Ghz CPU with 2 GB RAM for 999 iterations

zinb.boot.out     # basic output
summary(msi.zinb) # compare with original coefficients and SE

### for histogram and QQplot for a given component
plot(zinb.boot.out, index = 1)

### histograms of all components
par(mfrow=c(3,5))
for (i in 1:14) {
  hist(zinb.boot.out$t[,i], col = "yellow", main = "",
      xlab = names(coef(msi.zinb))[i])
  abline(v = zinb.boot.out$t0[i], lwd=2)    # add reference line at original value
  }
  
########################### ZINB Mixed Models ##################################
#
### the following code uses the functions by Drs. Andy Lee and Kelvin Yau
### to fit a zero-inflated negative-binomial regression with up two (possible)
### random-effects: 1) in the logistic portion of the model, and 2) in the 
### counts portion of the model.
#
### first, source in the R code from the file "R - Multilevel ZINB.R"
source("R - Multilevel ZINB.txt")

### the sumzinbmix function requires the following arguments:
#
### y = vector with outcome variable
### x.l = matrix of predictors for logistic portion
### x.nb = matrix of predictors for negative-binomial portion
### random = vector with grouping variable
### r.l = TRUE/FALSE: random-effect in logistic portion
### r.nb = TRUE/FALSE: random-effect in negative-binomial portion

### use same predictors in both portions; model.matrix() function will
### allow us to use formula interface to generate predictor matrix,
### though we need to pull out the intercept
x.mat <- model.matrix(~ das.c*infidelity + gender + afc.c + sex.c,
                          data = couple.df)[,-1]

### you might wish to turn off buffered output (/Misc/Buffered Output)
### so that you can see the iterations in real-time
#                          
### model with no random-effects
msi.zinbmix <- sumzinbmix(y = couple.df$msi,
                           x.nb = x.mat,
                           x.l = x.mat,
                           random = couple.df$id,
                           r.nb = FALSE, r.l = FALSE)
### random-intercept logistic
msi.zinbmix2 <- sumzinbmix(y = couple.df$msi,
                           x.nb = x.mat,
                           x.l = x.mat,
                           random = couple.df$id,
                           r.nb = FALSE, r.l = TRUE)
### random-intercept both
msi.zinbmix3 <- sumzinbmix(y = couple.df$msi,
                           x.nb = x.mat,
                           x.l = x.mat,
                           random = couple.df$id,
                           r.nb = TRUE, r.l = TRUE)

### examine log-likelihoods
msi.zinbmix$loglik ; msi.zinbmix2$loglik ; msi.zinbmix3$loglik

### deviance test for two effect model compared to no random-effects
1 - pchisq(2*(msi.zinbmix3$loglik - msi.zinbmix$loglik), df=2)

### coefficients from final model
zinb.coef(msi.zinbmix3)

### components of fitted object
str(msi.zinbmix3)

### random-effects variance for logistic
msi.zinbmix3$sigu

### random-effects variance for NB
msi.zinbmix3$sigv

### BLUPs of random-effects for logistic
msi.zinbmix3$risku

### QQplot and histogram
par(mfrow = c(1,2))
qqnorm(msi.zinbmix3$risku)
qqline(msi.zinbmix3$risku)
hist(msi.zinbmix3$risku, col = "yellow")

### NOTE: distributions skewed, which violates normality assumption

### BLUPs of random-effects for NB
msi.zinbmix3$riskv

### QQplot and histogram
par(mfrow = c(1,2))
qqnorm(msi.zinbmix3$riskv)
qqline(msi.zinbmix3$riskv)
hist(msi.zinbmix3$riskv, col = "yellow")

### NOTE: reasonably normally distributed