#############################################################################
###########          R Script: Visitor Intervention              ###########
#############################################################################

require(foreign)
require(ggplot2)
require(stats)
require(car)
require(plyr)
require(pastecs)
require(pscl)
require(sandwich)
require(MASS)
require(emmeans)
require(sjmisc)
require(psych)
require(lmtest)
require(AER)   # provides dispersiontest(), used for the overdispersion checks below
# Note: mean_cl_boot in the ggplot2 figures below additionally requires the Hmisc package
# to be installed (it is called internally and does not need to be attached).

setwd("C:/Users/Susanne Gaube/Dropbox/01.Promotion/02. Research/05_HH Visitors Intervention")
data <- read.spss("Data.sav", to.data.frame = TRUE, use.value.labels = FALSE)

# Looking at the data file
head(data)
table(data$Condition)

# Setting usage rates > 100 % to NA and removing intervals without traffic (missing Rate)
dat <- data
summary(dat$Rate > 100)
dat[which(dat$Rate > 100), "Rate"] <- NA
dat <- dat[!is.na(dat$Rate), ]
summary(dat$Rate)
round(sum(dat$Traffic), 0)
round(sum(dat$Usage), 0)
sum(dat$DispenserIn)
sum(dat$DispenserOut)
round(describe(dat$Rate), 1)
# (the corresponding describe() summaries for the baseline-only and experimental-only
#  rates appear in their sections below, once dat.c and dat.e have been created)

# Including a variable for the median split on traffic
round(describe(dat$Traffic), 1)
dat$H.L.Traffic <- dicho(dat$Traffic, dich.by = "median")

# Including a variable for relative increases over the mean baseline usage rate of 6.0 %
dat$Increase <- (dat$Rate * 100 / 6.0) - 100
Increase_Mean <- round(tapply(dat$Increase, dat$Condition, mean), 1)

# Labeling the Control (Baseline) and Experimental groups
is.even <- function(x) x %% 2 == 0
dat$Group <- "Control"
dat$Group[is.even(dat$Condition)] <- "Experimental"

## Looking at the dispenser usage rate
hist(dat$Rate, main = "Histogram for Dispenser Usage Rates",
     xlab = "Dispenser Usage Rates in %", ylab = "Number of 15-minute intervals",
     xlim = c(0, 100), breaks = 50)
## -> not normally distributed

#-----------------------------------------------------------------------------------------------
# Checking whether dispenser usage differs between the baseline weeks

## Creating a data file with only the baseline week data
dat.c <- dat[which(dat$Group == "Control"), ]
dat.c$Condition <- as.factor(dat.c$Condition)
dat.c$Condition <- factor(dat.c$Condition, levels = c(1, 3, 5, 7, 9, 11, 13),
                          labels = c("W1_Baseline", "W2_Baseline", "W3_Baseline",
                                     "W4_Baseline", "W5_Baseline", "W6_Baseline",
                                     "W7_Baseline"))

## Descriptive statistics
round(describe(dat.c$Rate), 1)
sum(dat.c$Traffic)
by(dat.c$Traffic, dat.c$Condition, sum)
sum(dat.c$Usage)
round(by(dat.c$Usage, dat.c$Condition, sum), 0)
round(tapply(dat.c$Rate, dat.c$Condition, mean), 1)
round(tapply(dat.c$Rate, dat.c$Condition, sd), 1)
table(dat.c$Condition)

## Distribution of the dispenser usage rate
hist(dat.c$Rate, main = "Histogram for Usage Rates", xlab = "Usage Rates",
     xlim = c(0, 100), breaks = 50)
## -> not normally distributed
shapiro.test(dat.c$Rate)  # not normally distributed

## Plotting the baseline weeks
a <- data.frame(Rate = as.numeric(dat.c$Rate), Condition = as.factor(dat.c$Condition))
### mean used for the blue reference line in the graph
round(mean(dat.c$Rate), 2)
round(sd(dat.c$Rate), 2)
graph.a <- ggplot(a, aes(Condition, Rate)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  theme_bw() +
  labs(x = "Baseline Weeks", y = "Mean Dispenser Usage in %") +
  geom_hline(yintercept = 6.0, color = "blue", linetype = 2) +
  expand_limits(y = c(4, 9)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
graph.a
ggsave("GaubeFig1.jpeg", plot = graph.a, width = 6.885, height = 5.16, dpi = 600)
ggsave("Fig1.tiff", plot = graph.a, width = 6.885, height = 5.16, dpi = 600)
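## Optional distribution-free cross-check (a minimal sketch): because Rate is clearly
## non-normal, a Kruskal-Wallis test compares the baseline weeks without the normality
## assumption required by the linear models in the next section
kruskal.test(Rate ~ Condition, data = dat.c)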
# Regression models: baseline weeks only

## Normal linear regression -> not appropriate because the data are not normally distributed
normal_baseline <- lm(Rate ~ Condition, data = dat.c)
summary.lm(normal_baseline)

## ANOVA -> not appropriate because the data are not normally distributed
car::leveneTest(dat.c$Rate, dat.c$Condition, center = mean)
anova_aov_baseline <- aov(Rate ~ Condition, data = dat.c)
summary.lm(anova_aov_baseline)

## Poisson regression
poisson_baseline <- glm(Rate ~ Condition, data = dat.c, family = "poisson")
summary(poisson_baseline)

### Testing for overdispersion
dispersiontest(poisson_baseline)  # there is overdispersion

## Negative binomial regression
nbr_baseline <- glm.nb(Rate ~ Condition, data = dat.c)
summary(nbr_baseline)
BIC(nbr_baseline)
logLik(nbr_baseline)
ref_grid_nbr_baseline <- ref_grid(nbr_baseline)
summary(ref_grid_nbr_baseline)
post_hoc_nbr_baseline <- emmeans(nbr_baseline, ~ Condition, contr = "tukey")

### Coefficient table (b, SE, z, p, exp(b) and its CI), exported to CSV
baseline_table <- round((est <- cbind(b  = coef(nbr_baseline),
                                      se = coef(summary(nbr_baseline))[, 2],
                                      z  = coef(summary(nbr_baseline))[, 3],
                                      p  = coef(summary(nbr_baseline))[, 4],
                                      eb = exp(coef(nbr_baseline)),
                                      CI = exp(confint(nbr_baseline)))), 3)
write.csv(baseline_table, "baseline_table.csv")

#-------------------------------------------------------------------------------------------------------
# Experimental weeks

## Creating a data file with only the experimental week data
dat.e <- dat[which(dat$Group == "Experimental"), ]

## Descriptive statistics
round(describe(dat.e$Rate), 1)
round(sum(dat.e$Traffic), 0)
round(by(dat.e$Traffic, dat.e$Condition, sum), 0)
round(sum(dat.e$Usage), 0)
round(by(dat.e$Usage, dat.e$Condition, sum), 0)
round(tapply(dat.e$Rate, dat.e$Condition, mean), 1)
round(tapply(dat.e$Rate, dat.e$Condition, sd), 1)
table(dat.e$Condition)

## Plotting the experimental weeks
c <- data.frame(Rate = as.numeric(dat.e$Rate), Exp = as.factor(dat.e$Conditionnames))
graph.c <- ggplot(c, aes(Exp, Rate)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Experimental Weeks", y = "Mean Dispenser Usage in %") +
  geom_hline(yintercept = 6.0, color = "blue", linetype = 2) +
  expand_limits(y = c(4, 9))
graph.c
ggsave("GaubeFig2.jpeg", plot = graph.c, width = 6.885, height = 5.16, dpi = 600)
ggsave("Fig2.tiff", plot = graph.c, width = 6.885, height = 5.16, dpi = 600)

# Regression of the combined baseline against all experimental weeks

## Creating a data file with the baseline and experimental week data
dt <- data.frame(Rate      = dat$Rate,
                 Condition = dat$Condition,
                 Group     = dat$Group,
                 Exp       = as.factor(dat$Conditionnames),  # week labels treated as a factor
                 Traffic   = dat$Traffic)

## Normal linear regression -> not appropriate because the data are not normally distributed
normal <- lm(Rate ~ Exp, data = dt)
summary.lm(normal)

## ANOVA -> not appropriate because the data are not normally distributed
car::leveneTest(dt$Rate, dt$Exp, center = mean)
anova <- aov(Rate ~ Exp, data = dt)
summary.lm(anova)

## Poisson regression
poisson <- glm(Rate ~ Exp, data = dt, family = "poisson")
summary(poisson)

### Testing for overdispersion
dispersiontest(poisson)  # there is overdispersion

## Negative binomial regression without traffic
nbr <- glm.nb(Rate ~ Exp, data = dt)
summary(nbr)
BIC(nbr)
logLik(nbr)
ref_grid_nbr <- ref_grid(nbr)
summary(ref_grid_nbr)
post_hoc_nbr <- emmeans(nbr, ~ Exp, contr = "tukey")
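## Optional cross-check on the overdispersion conclusion (a minimal sketch): pscl is loaded
## above, and its odTest() runs a likelihood-ratio test of the negative binomial model
## against its Poisson restriction (dispersion parameter fixed at zero)
odTest(nbr)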
## Negative binomial regression including only the weeks in which signs were displayed
dat.e$Conditionnames <- as.factor(dat.e$Conditionnames)
nbr.s <- glm.nb(Rate ~ Conditionnames, data = dat.e)
summary(nbr.s)
ref_grid_nbr.s <- ref_grid(nbr.s)
summary(ref_grid_nbr.s)
post_hoc_nbr.s <- emmeans(nbr.s, ~ Conditionnames, contr = "tukey")

### Coefficient table for the combined model (nbr), exported to CSV
experiment_table <- round((est <- cbind(b  = coef(nbr),
                                        se = coef(summary(nbr))[, 2],
                                        z  = coef(summary(nbr))[, 3],
                                        p  = coef(summary(nbr))[, 4],
                                        eb = exp(coef(nbr)),
                                        CI = exp(confint(nbr)))), 3)
write.csv(experiment_table, "experiment_table.csv")

#-------------------------------------------------------------------------------------------------------
# Testing the effect of Traffic

## Correlation between Traffic and Usage Rate
corr.all <- cor.test(x = dat$Traffic, y = dat$Rate, method = "pearson")
corr.all
corr.c <- cor.test(x = dat.c$Traffic, y = dat.c$Rate, method = "pearson")
corr.c
corr.e <- cor.test(x = dat.e$Traffic, y = dat.e$Rate, method = "pearson")
corr.e

## Baseline
wtest.d.b <- wilcox.test(Rate ~ H.L.Traffic, data = dat.c, conf.int = TRUE)
wtest.d.b

## Reciprocity
d.re <- subset(dat, Condition == 2)
corr.re <- cor.test(x = d.re$Traffic, y = d.re$Rate, method = "pearson")
corr.re
wtest.d.re <- wilcox.test(Rate ~ H.L.Traffic, data = d.re, conf.int = TRUE)
wtest.d.re

## Consistency
d.co <- subset(dat, Condition == 4)
corr.co <- cor.test(x = d.co$Traffic, y = d.co$Rate, method = "pearson")
corr.co
wtest.d.co <- wilcox.test(Rate ~ H.L.Traffic, data = d.co, conf.int = TRUE)
wtest.d.co

## Unity
d.un <- subset(dat, Condition == 6)
corr.un <- cor.test(x = d.un$Traffic, y = d.un$Rate, method = "pearson")
corr.un
wtest.d.un <- wilcox.test(Rate ~ H.L.Traffic, data = d.un, conf.int = TRUE)
wtest.d.un

## Social Proof
d.sp <- subset(dat, Condition == 8)
corr.sp <- cor.test(x = d.sp$Traffic, y = d.sp$Rate, method = "pearson")
corr.sp
wtest.d.sp <- wilcox.test(Rate ~ H.L.Traffic, data = d.sp, conf.int = TRUE)
wtest.d.sp

## Liking
d.li <- subset(dat, Condition == 10)
corr.li <- cor.test(x = d.li$Traffic, y = d.li$Rate, method = "pearson")
corr.li
wtest.d.li <- wilcox.test(Rate ~ H.L.Traffic, data = d.li, conf.int = TRUE)
wtest.d.li

## Authority
d.au <- subset(dat, Condition == 12)
corr.au <- cor.test(x = d.au$Traffic, y = d.au$Rate, method = "pearson")
corr.au
wtest.d.au <- wilcox.test(Rate ~ H.L.Traffic, data = d.au, conf.int = TRUE)
wtest.d.au

## Scarcity
d.sc <- subset(dat, Condition == 14)
corr.sc <- cor.test(x = d.sc$Traffic, y = d.sc$Rate, method = "pearson")
corr.sc
wtest.d.sc <- wilcox.test(Rate ~ H.L.Traffic, data = d.sc, conf.int = TRUE)
wtest.d.sc

#-----------------------------------------------------------------------------------------------------
## Independent two-group Mann-Whitney U test for social proof against its baseline
dsp <- subset(dat, Condition == 7 | Condition == 8)
shapiro.test(dsp$Rate)  # not normal
wtest.dsp <- wilcox.test(Rate ~ Condition, data = dsp, conf.int = TRUE)
wtest.dsp

## Independent two-group Mann-Whitney U test for authority against its baseline
da <- subset(dat, Condition == 11 | Condition == 12)
shapiro.test(da$Rate)  # not normal
wtest.da <- wilcox.test(Rate ~ Condition, data = da, conf.int = TRUE)
wtest.da

#-------------------------------------------------------------------------------------
## Relative increases of all experimental conditions in comparison to the baseline level
Increase_Mean <- round(tapply(dat$Increase, dat$Condition, mean), 1)
if (!requireNamespace("Publish", quietly = TRUE)) install.packages("Publish")
require(Publish)
ci.mean(d.re$Increase)
ci.mean(d.co$Increase)
ci.mean(d.un$Increase)
ci.mean(d.sp$Increase)
ci.mean(d.li$Increase)
ci.mean(d.au$Increase)
ci.mean(d.sc$Increase)
ci.mean(dat.e$Increase)

## Relative increases from the predecessor (baseline) week
## (6.4 and 6.1 are the mean usage rates of the respective preceding baseline weeks)
d.au$Increase2 <- (d.au$Rate * 100 / 6.4) - 100
ci.mean(d.au$Increase2)
d.sp$Increase2 <- (d.sp$Rate * 100 / 6.1) - 100
ci.mean(d.sp$Increase2)
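## Compact alternative (a minimal sketch): the per-condition confidence intervals for
## Increase above can also be produced in one pass by splitting on the condition code
tapply(dat.e$Increase, dat.e$Condition, ci.mean)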
#----------------------------------------------------------------------
# Figures in perspective: additional dispenser uses per year implied by the rate difference
# (interpreting 17580 as the visitor traffic per week and 7.7 % as the usage rate during
#  the experimental weeks; multiplying by 52 extrapolates the weekly difference to one year)
b <- round((6.0 * 17580 / 100), 0)   # weekly dispenser uses at the 6.0 % baseline rate
a <- round((7.7 * 17580 / 100), 0)   # weekly dispenser uses at the 7.7 % rate
(a - b) * 52                         # 15548 additional uses per year
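## The same back-of-envelope calculation wrapped in a small helper (a sketch; the function
## name and arguments are illustrative, and the values simply mirror the numbers used above)
extra_uses_per_year <- function(baseline_rate, intervention_rate, weekly_traffic, weeks = 52) {
  weekly_baseline     <- round(baseline_rate * weekly_traffic / 100, 0)
  weekly_intervention <- round(intervention_rate * weekly_traffic / 100, 0)
  (weekly_intervention - weekly_baseline) * weeks
}
extra_uses_per_year(6.0, 7.7, 17580)  # 15548, matching the figure above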