## QCBS R workshops
## Workshop 6 - GLMs and GLMMs
## Authors: Cédric Frenette-Dussault, Vincent Fugère, Thomas Lamy, Zofia Taranu
## Date: November 2014
## R version 3.0.2

#### Loading data ####
setwd("~/Desktop")
mites <- read.csv('mites.csv')
head(mites)
str(mites)
# 70 mite communities sampled from moss cores collected at the Station de Biologie des Laurentides, QC.
# For each core/sample, the following data are provided:
# $Galumna: abundance of the mite genus 'Galumna'
# $pa: presence (1) or absence (0) of Galumna, irrespective of abundance
# $totalabund: total abundance of mites of all species
# $prop: proportion of Galumna in the mite community, i.e. Galumna/totalabund
# $SubsDens: substrate density
# $WatrCont: water content
# $Substrate: type of substrate
# $Shrub: abundance of shrubs, coded as a factor
# $Topo: microtopography, blanket or hummock

#### Limitations of linear models ####
# Research question: do the abundance, occurrence (presence/absence), and
# proportion of Galumna vary as a function of the 5 environmental variables?

# Can any relationships be seen graphically?
plot(mites)

# Galumna abundance, occurrence, and proportion all seem to vary with WatrCont:
par(mfrow=c(1,3)) # divide the plot area into 1 row and 3 columns to show 3 plots in the same figure (click zoom to see better)
plot(Galumna ~ WatrCont, data = mites, xlab = 'Water content', ylab = 'Abundance')
boxplot(WatrCont ~ pa, data = mites, xlab = 'Presence/Absence', ylab = 'Water content')
plot(prop ~ WatrCont, data = mites, xlab = 'Water content', ylab = 'Proportion')
par(mfrow=c(1,1)) # reset to default plot settings for future plots
# Galumna seems to prefer drier sites.
# Can we test that relationship statistically?
lm.abund <- lm(Galumna ~ WatrCont, data = mites)
summary(lm.abund)
lm.pa <- lm(pa ~ WatrCont, data = mites)
summary(lm.pa)
lm.prop <- lm(prop ~ WatrCont, data = mites)
summary(lm.prop)
# All significant, great! Wait a minute...

# Let's validate and plot the results of these models:
plot(Galumna ~ WatrCont, data = mites)
abline(lm.abund) # does not fit well: predicts negative abundance at WatrCont > 600, and cannot predict high abundance at low WatrCont
plot(lm.abund) # not great: the data are non-normal and the variance is heterogeneous
# Same problem with prop:
plot(prop ~ WatrCont, data = mites)
abline(lm.prop)
plot(lm.prop)
# Even worse for presence/absence...
plot(pa ~ WatrCont, data = mites) abline(lm.pa) plot(lm.pa) # #Code to generate figure that shows the parameters of a normal distribution # par(mfrow=c(1,2)) # plot(0:50, dnorm(0:50,20,5), type='l', lwd=1, xlab='#galumna', ylab='probability',cex.lab=0.8,cex.axis=0.8) # lines(0:50, dnorm(0:50,25,5), lwd=1, col='red') # lines(0:50, dnorm(0:50,30,5), lwd=1, col='blue') # legend(-3,0.08,legend = c(expression(italic(mu) == 20),expression(italic(mu) == 25),expression(italic(mu) == 30)), bty='n', text.col=c('black','red','blue'), cex=0.7) # mtext(side=3,line=.5,cex=0.7,expression("varying"~italic(mu)*","~~italic(sigma) == 5)) # plot(0:50, dnorm(0:50,25,5), type='l', lwd=1, xlab='#galumna', ylab='probability',cex.lab=0.8,cex.axis=0.8) # lines(0:50, dnorm(0:50,25,7.5), lwd=1, col='red') # lines(0:50, dnorm(0:50,25,10), lwd=1, col='blue') # legend(-3,0.08,legend = c(expression(italic(sigma) == 5),expression(italic(sigma) == 7.5),expression(italic(sigma) == 10)), bty='n', text.col=c('black','red','blue'), cex=0.7) # mtext(side=3,line=.5,cex=0.7,expression(italic(mu) ==25*","~~"varying"~italic(sigma))) #coefficient of galumna abundance lm coef(lm.abund) # #Code to generate figure that illustrates assumptions of linear models # plot.norm<-function(x,mymodel,mult=1,sd.mult=3,mycol='LightSalmon',howmany=150) { # yvar<-mymodel$model[,1] # xvar<-mymodel$model[,2] # sigma<-summary(mymodel)$sigma # stick.val<-rep(xvar[x],howmany)+mult*dnorm(seq(predict(mymodel)[x]-sd.mult*sigma, predict(mymodel)[x]+sd.mult*sigma, length=howmany), mean=predict(mymodel)[x],sd=sigma) # steps<-seq(predict(mymodel)[x]-sd.mult*sigma,predict(mymodel)[x]+sd.mult*sigma,length=howmany) # polygon(c(stick.val,rep(xvar[x],howmany)),c(sort(steps,decreasing=T),steps),col=mycol,border=NA) # } # #function adapted from http://www.unc.edu/courses/2010fall/ecol/563/001/notes/lecture4%20Rcode.txt # plot(Galumna ~ WatrCont, data = mites,ylim=c(-4,8),cex.axis=1,cex.lab=1,type='n') # plot.norm(8,lm.abund,200) # plot.norm(11,lm.abund,200) # plot.norm(36,lm.abund,200) # plot.norm(52,lm.abund,200) # abline(h=0,lty=3) # points(Galumna ~ WatrCont, data = mites,pch=21) # abline(lm.abund,lty=1) # abline(v=mites$WatrCont[c(8,11,36,52)],col='red',lty=2) # text(x = mites$WatrCont[8]+50,y=7.5,expression(mu == 1.8),cex=1,col='red') # text(x = mites$WatrCont[11]+50,y=7.5,expression(mu == 2.6),cex=1,col='red') # text(x = mites$WatrCont[36]+50,y=7.5,expression(mu == 0.9),cex=1,col='red') # text(x = mites$WatrCont[52]+60,y=7.5,expression(mu == -0.1),cex=1,col='red') # text(x = mites$WatrCont[52]+105,y=6.5,expression(sigma == 'always' ~ 1.51),cex=1,col='red') #calculating sigma for lm.abund: summary(lm.abund)$sigma #### Distributions #### # how is our data distributed? hist(mites$Galumna) #integers not continuous variable. 
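# A simple frequency table makes the same point (a quick sketch, not part of the original workshop script):
# abundances are small non-negative integers, not a continuous variable.
table(mites$Galumna)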
mean(mites$Galumna) hist(mites$pa) #zeros and ones sum(mites$pa) / nrow(mites) # #Code to generate 3 figures that illustrate bernouilli, binomial, and poisson distributions # par(mfrow=c(1,3), lend=3) # plot(0:1, dbinom(0:1,1,.1), xlim=c(-0.5,1.5), type='h', lwd=15, xaxt='n', xlab='p/a', ylab='probability') # axis(1,at=c(0,1),labels=c('absent(0)','present(1)'),tick=F) # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.1)) # plot(0:1, dbinom(0:1,1,.5), xlim=c(-0.5,1.5), type='h', lwd=15, xaxt='n', xlab='p/a', ylab='probability') # axis(1,at=c(0,1),labels=c('absent(0)','present(1)'),tick=F) # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.5)) # plot(0:1, dbinom(0:1,1,.9), xlim=c(-0.5,1.5), type='h', lwd=15, xaxt='n', xlab='p/a', ylab='probability') # axis(1,at=c(0,1),labels=c('absent(0)','present(1)'),tick=F) # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.9)) # par(mfrow=c(1,1)) # # # par(mfrow=c(1,3), lend=3) # plot(0:50, dbinom(0:50,50,.1), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.1*","~~italic(n) == 50)) # plot(0:50, dbinom(0:50,50,.5), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.5*","~~italic(n) == 50)) # plot(0:50, dbinom(0:50,50,.9), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5,cex=0.7,expression(italic(p) ==.9*","~~italic(n) == 50)) # par(mfrow=c(1,1)) # # par(mfrow=c(1,3), lend=3) # plot(0:50, dpois(0:50,1), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5, cex=0.7, expression(lambda==1)) # plot(0:50, dpois(0:50,10), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5, cex=0.7, expression(lambda==10)) # plot(0:50, dpois(0:50,30), type='h', lwd=1, xlab='#galumna', ylab='probability') # mtext(side=3,line=.5, cex=0.7, expression(lambda==30)) # par(mfrow=c(1,1)) # # #Code to generate figure that illustrates a poisson glm # glm.pois <- glm(Galumna ~ WatrCont, data = mites, family='poisson') # plot.poiss<-function(x,mymodel,mult=1,mycol='LightSalmon') { # yvar<-mymodel$model[,1] # xvar<-mymodel$model[,2] # lambd<-mymodel$fitted[x] # stick.val<-rep(xvar[x],9)+mult*dpois(0:8,lambd=lambd) # segments(rep(xvar[x],9),0:8,stick.val,0:8,col=mycol,lwd=3) # } # plot(Galumna ~ WatrCont, data = mites,cex.axis=0.7,cex.lab=0.7) # points(Galumna ~ WatrCont, data = mites,pch=21) # lines(x=seq(min(mites$WatrCont),max(mites$WatrCont),by=1),y=predict(glm.pois,newdata=data.frame('WatrCont' = seq(min(mites$WatrCont),max(mites$WatrCont),by=1)),type='response')) # par(lend=3) # plot.poiss(8,glm.pois,200) # abline(v=mites$WatrCont[8],col='red',lty=2) # plot.poiss(11,glm.pois,200) # abline(v=mites$WatrCont[11],col='red',lty=2) # plot.poiss(36,glm.pois,200) # abline(v=mites$WatrCont[36],col='red',lty=2) # plot.poiss(52,glm.pois,200) # abline(v=mites$WatrCont[52],col='red',lty=2) # text(x = mites$WatrCont[8]+50,y=7.5,expression(lambda == 1.7),cex=0.7,col='red') # text(x = mites$WatrCont[11]+50,y=7.5,expression(lambda == 4.7),cex=0.7,col='red') # text(x = mites$WatrCont[36]+50,y=7.5,expression(lambda == 0.5),cex=0.7,col='red') # text(x = mites$WatrCont[52]+50,y=7.5,expression(lambda == 0.1),cex=0.7,col='red') #### A GLM with binary variables #### ## Inappropriate use of a linear model with a binary response variable ## model.lm <- lm(pa ~ WatrCont + Topo, data = mites) fitted(model.lm) # The "fitted()" function gives us expected values for the response variable. 
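# A quick check (sketch, not in the original script): count how many fitted values fall outside
# the [0, 1] range that a probability must respect.
sum(fitted(model.lm) < 0 | fitted(model.lm) > 1)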
# Some of these fitted values are below 0, which does not make sense for presence/absence data:
# a probability must lie between 0 and 1.
# Let's try the same model with a binomial distribution instead.
# Notice the "family" argument used to specify the distribution.
model.glm <- glm(pa ~ WatrCont + Topo, data = mites, family = binomial)
fitted(model.glm)
# All values are now bounded between 0 and 1.

## The concept of the linear predictor ##
# Load the CO2 dataset. We used it during workshop 3!
data(CO2)
head(CO2)
# Build a linear model of plant CO2 uptake as a function of ambient CO2 concentration
model.CO2 <- lm(uptake ~ conc, data = CO2)
# Extract the design matrix of the model with the model.matrix() function.
X <- model.matrix(model.CO2)
# And the estimated coefficients.
B <- model.CO2$coefficients
# Let's multiply the X and B matrices to obtain the linear predictor.
# The "%*%" operator indicates a matrix product.
XB <- X %*% B
# Compare the values of XB to the values obtained with the fitted() function.
# All statements should be TRUE.
# We use the round() function so that both sets of values are rounded to 5 decimal places.
round(fitted(model.CO2), digits = 5) == round(XB, digits = 5)

## A simple example of logistic regression ##
# Let's build a regression model of the presence/absence of a mite species (Galumna sp.)
# as a function of water content and topography.
# To do this, we use the glm() function and specify the family argument.
logit.reg <- glm(pa ~ WatrCont + Topo, data = mites, family = binomial(link = "logit"))
# The logit link is the default for the binomial distribution,
# so it is not necessary to specify it in the "family" argument:
logit.reg <- glm(pa ~ WatrCont + Topo, data = mites, family = binomial)
summary(logit.reg)

## Interpreting the output of a logistic regression ##
# Obtaining odds ratios from the slopes.
# Use the exp() function to put the coefficients back on the odds scale.
# Mathematically, this line of code corresponds to:
# exp(model coefficients) = exp(log(µ / (1 - µ))) = µ / (1 - µ)
# This corresponds to an odds ratio!
exp(logit.reg$coefficients[2:3])
#    WatrCont TopoHummock
#   0.9843118   8.0910340
# To obtain confidence intervals on the odds scale:
exp(confint(logit.reg)[2:3,])
#                 2.5 %     97.5 %
# WatrCont    0.9741887  0.9919435
# TopoHummock 2.0460547 38.6419693

## Some extra math about the inverse logit ##
# Let's start with the odds ratio for topography in our logit.reg model:
# µ / (1 - µ) = 8.09
# Let's rearrange this to isolate µ:
# µ = 8.09 (1 - µ) = 8.09 - 8.09µ
# 8.09µ + µ = 8.09
# µ (8.09 + 1) = 8.09
# µ = 8.09 / (8.09 + 1)
# µ = 1 / (1 + (1 / 8.09)) = 0.89
# We obtained the same result without using the exp() function!
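# R's built-in inverse logit, plogis(), can confirm this back-transformation numerically
# (a quick sketch, not part of the original workshop script):
plogis(logit.reg$coefficients["TopoHummock"])  # inverse logit of the coefficient, ~0.89
8.09 / (1 + 8.09)                              # same value obtained directly from the odds ratio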
## How to compute a pseudo-R2 for a GLM ##
# Residual and null deviances are already stored in the glm object.
objects(logit.reg)
pseudoR2 <- (logit.reg$null.deviance - logit.reg$deviance) / logit.reg$null.deviance
pseudoR2
# [1] 0.4655937

## The coefficient of discrimination and its visualisation ##
install.packages("binomTools")
library("binomTools")
# The Rsq function computes several fit indices,
# including the coefficient of discrimination.
# For information on the other fit indices, see Tjur (2009).
# The plot shows the distribution of expected values when the outcome is observed
# and when it is not observed.
# Ideally, the overlap between the two histograms should be small.
fit <- Rsq(object = logit.reg)
fit
# R-square measures and the coefficient of discrimination, 'D':
#
#    R2mod     R2res     R2cor         D
# 0.5205221 0.5024101 0.5025676 0.5114661
#
# Number of binomial observations:  70
# Number of binary observation:  70
# Average group size:  1
plot(fit, which = "hist")

## The Hosmer-Lemeshow test ##
fit <- Rsq(object = logit.reg)
HLtest(object = fit)
# The p-value is 0.9051814, so we do not reject the model:
# we can consider it appropriate for the data.

## Plotting the results of a logistic regression ##
library(ggplot2)
ggplot(mites, aes(x = WatrCont, y = pa)) +
  geom_point() +
  stat_smooth(method = "glm", family = "binomial", se = FALSE) +
  xlab("Water content") +
  ylab("Probability of presence") +
  ggtitle("Probability of presence of Galumna sp. as a function of water content")

## An example with proportion data ##
# Let's generate some data based on the deer example:
# We randomly choose a number between 1 and 10 for the number of infected deer.
# Ten deer were sampled in each of ten populations.
# Resource availability is an index used to characterise the habitat.
set.seed(123)
n.infected <- sample(x = 1:10, size = 10, replace = TRUE)
n.total <- rep(x = 10, times = 10)
res.avail <- rnorm(n = 10, mean = 10, sd = 1)
# Next, let's build the model. Notice how the proportion data are specified:
# we have to provide the number of cases where the disease was detected
# and the number of cases where it was not detected.
prop.reg <- glm(cbind(n.infected, n.total - n.infected) ~ res.avail, family = binomial)
summary(prop.reg)
# If your data are already expressed as proportions, here is how to do it in R:
# first create a vector of proportions
prop.infected <- n.infected / n.total
# then specify the "weights" argument of glm() to indicate the number of trials per site
prop.reg2 <- glm(prop.infected ~ res.avail, family = binomial, weights = n.total)
summary(prop.reg2)
# The summaries of prop.reg and prop.reg2 are identical!

#### GLMs with count data ####
# Load the faramea dataset
faramea <- read.csv('faramea.csv', header = TRUE)
# Let's look at the data
str(faramea)
# Plot the histogram of the number of Faramea occidentalis and its relationship with elevation
par(mfrow=c(1,2))
hist(faramea$Faramea.occidentalis, breaks=seq(0,45,1), xlab=expression(paste("Number of ", italic(Faramea~occidentalis))), ylab="Frequency", main="", col="grey")
plot(faramea$Elevation, faramea$Faramea.occidentalis, xlab="Elevation (m)", ylab=expression(paste("Number of", " ", italic(Faramea~occidentalis))), pch=1, col="black")
# A Poisson GLM (a simple Poisson regression) seems to be a good choice
# to model the number of Faramea occidentalis as a function of elevation
glm.poisson <- glm(Faramea.occidentalis~Elevation, data=faramea, family=poisson)
summary(glm.poisson)
# The parameter estimates of the model can also be extracted as follows:
# intercept
summary(glm.poisson)$coefficients[1,1]
# slope of elevation
summary(glm.poisson)$coefficients[2,1]
# The summary suggests that elevation has a negative effect on the abundance of Faramea occidentalis.
# This can also be evaluated based on deviance:
null.model <- glm(Faramea.occidentalis~1, data=faramea, family=poisson)
anova(null.model, glm.poisson, test="Chisq")
# We gain 26.69 units of deviance by including elevation, at the cost of 1 df.
# The difference in deviance approximately follows a Chi-square distribution
# with 1 degree of freedom:
1-pchisq(26.686, 1)
# Though significant, does the model provide a good fit to the data?
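# One quick heuristic (a sketch, not in the original script): compare the residual deviance
# to the residual degrees of freedom; a ratio much greater than 1 hints at overdispersion.
summary(glm.poisson)$deviance / summary(glm.poisson)$df.residual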
# Remember that the Poisson distribution assumes mean = variance.
# An inspection of the data gives a good idea of whether the model will correctly fit the data.
mean(faramea$Faramea.occidentalis)
# mean abundance of F. occidentalis is 3.88 individuals per quadrat
var(faramea$Faramea.occidentalis)
# the variance of the data is extremely high compared to its mean
# ========== Facultative
# What does this mean?
# Let's compute the Poisson probabilities for abundances from 0 to 50, with mean 3.88:
y.sim <- dpois(0:50, mean(faramea$Faramea.occidentalis))
# The following plot describes the expected distribution of F. occidentalis if elevation had no effect
# (this is the distribution of the null model: all observations have the same mean).
plot(0:50, y.sim*dim(faramea)[1], type = "h", xlab = expression(paste("Number of ", italic(Faramea~occidentalis))), ylab = "Frequency", main = "Poisson distribution with parameter 3.88")
# ========== Facultative
# In addition, the residual deviance is 388.12 for 41 degrees of freedom.
# A Poisson GLM assumes that the residual deviance equals the residual degrees of freedom.
# When the residual deviance is much greater than the residual degrees of freedom,
# we say the model is overdispersed!

# Controlling for overdispersion using a quasi-Poisson distribution
# Fit a quasi-Poisson GLM to the data:
glm.quasipoisson = glm(Faramea.occidentalis~Elevation, data=faramea, family=quasipoisson)
# or
glm.quasipoisson = update(glm.poisson,family=quasipoisson)
summary(glm.quasipoisson)
# Elevation is no longer significant!
null.model <- glm(Faramea.occidentalis~1, data=faramea, family=quasipoisson)
anova(null.model, glm.quasipoisson, test="Chisq")
# This model is probably better, but that's a lot of overdispersion (Phi = 15.969 >> 1).
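# The dispersion parameter reported by the quasi-Poisson fit can also be estimated by hand
# (a sketch, not in the original script): the sum of squared Pearson residuals divided by
# the residual degrees of freedom of the Poisson model.
sum(residuals(glm.poisson, type = "pearson")^2) / glm.poisson$df.residual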
# In cases of extreme overdispersion (i.e. Phi > 15-20) a negative binomial (NB) distribution works better.
# The NB distribution is not available in the glm() function, so you need to install and load the MASS package.
# You do not remember whether you already installed this package? No problem, you can use the following:
ifelse(length(which(installed.packages() == "MASS")) == 0, {print("MASS not installed. Installing..."); install.packages("MASS")}, print("MASS already installed"))
# Alternatively, if you know that this package is not installed you can directly use the command install.packages("MASS").
library("MASS")
# Fit a negative binomial GLM to the data:
glm.negbin = glm.nb(Faramea.occidentalis~Elevation, data=faramea)
summary(glm.negbin)
# k = 0.259, and elevation is significant
# Our coefficients are:
# intercept
summary(glm.negbin)$coefficients[1,1]
# slope of elevation
summary(glm.negbin)$coefficients[2,1]
# Plot the model
# First plot the data:
plot(faramea$Elevation, faramea$Faramea.occidentalis, xlab="Elevation (m)", ylab=expression(paste("Number of", " ", italic(Faramea~occidentalis))), pch=16, col=rgb(4,139,154,150,maxColorValue=255))
# Use the values of the intercept (alpha) and slope (beta) from the summary in the exponential equation:
curve(exp(summary(glm.negbin)$coefficients[1,1]+summary(glm.negbin)$coefficients[2,1]*x),from=range(faramea$Elevation)[1],to=range(faramea$Elevation)[2],add=T, lwd=2, col="orangered")
# Use the standard errors from the summary to draw an approximate confidence envelope:
curve(exp(summary(glm.negbin)$coefficients[1,1]+1.96*summary(glm.negbin)$coefficients[1,2]+summary(glm.negbin)$coefficients[2,1]*x+1.96*summary(glm.negbin)$coefficients[2,2]),from=range(faramea$Elevation)[1],to=range(faramea$Elevation)[2],add=T,lty=2, col="orangered")
curve(exp(summary(glm.negbin)$coefficients[1,1]-1.96*summary(glm.negbin)$coefficients[1,2]+summary(glm.negbin)$coefficients[2,1]*x-1.96*summary(glm.negbin)$coefficients[2,2]),from=range(faramea$Elevation)[1],to=range(faramea$Elevation)[2],add=T,lty=2, col="orangered")
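# An alternative sketch (not in the original script): a similar envelope can be obtained with
# predict() on the link scale, then back-transformed with exp(); this avoids typing the
# coefficients and standard errors by hand.
newdat <- data.frame(Elevation = seq(min(faramea$Elevation), max(faramea$Elevation), length.out = 100))
pred <- predict(glm.negbin, newdata = newdat, type = "link", se.fit = TRUE)
lines(newdat$Elevation, exp(pred$fit + 1.96 * pred$se.fit), lty = 3, col = "orangered")
lines(newdat$Elevation, exp(pred$fit - 1.96 * pred$se.fit), lty = 3, col = "orangered")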
#### Challenge ####
# Use the mite data:
# mites <- read.csv("mites.csv", header = TRUE)
# Let's look at the data
par(mfrow=c(2,2))
hist(mites$Galumna, breaks=c(0:10), xlab=expression(paste("Number of ", italic(Galumna~sp))), ylab="Frequency", main="", col="grey")
plot(mites$SubsDens, mites$Galumna, xlab="Substrate density (g/L)", ylab=expression(paste("Number of", " ", italic(Galumna~sp))), pch=1, col="black")
plot(mites$WatrCont, mites$Galumna, xlab="Water content of the substrate (g/L)", ylab=expression(paste("Number of", " ", italic(Galumna~sp))), pch=1, col="black")
# Poisson GLM
glm.p = glm(Galumna~WatrCont+SubsDens, data=mites, family=poisson)
summary(glm.p)
# quasi-Poisson GLM
glm.qp = update(glm.p,family=quasipoisson)
summary(glm.qp)
# model selection
drop1(glm.qp, test = "Chi")
glm.qp2 = glm(Galumna~WatrCont, data=mites, family=quasipoisson)
anova(glm.qp2, glm.qp, test="Chisq")

#### GLMM ####
# First load and view the dataset
dat.tf <- read.csv("Banta_TotalFruits.csv")
str(dat.tf)
# 'data.frame': 625 obs. of 9 variables:
# $ X : int 1 2 3 4 5 6 7 8 9 10 ...
# $ reg : Factor w/ 3 levels "NL","SP","SW": 1 1 1 1 1 1 1 1 1 1 ...
# $ popu : Factor w/ 9 levels "1.SP","1.SW",..: 4 4 4 4 4 4 4 4 4 4 ...
# $ gen : int 4 4 4 4 4 4 4 4 4 5 ...
# $ rack : int 2 1 1 2 2 2 2 1 2 1 ...
# $ nutrient : int 1 1 1 1 8 1 1 1 8 1 ...
# $ amd : Factor w/ 2 levels "clipped","unclipped": 1 1 1 1 1 2 1 1 2 1 ...
# $ status : Factor w/ 3 levels "Normal","Petri.Plate",..: 3 2 1 1 3 2 1 1 1 2 ...
# $ total.fruits: int 0 0 0 0 0 0 0 3 2 0 ...
# 2-3 genotypes are nested within each of the 9 populations
table(dat.tf$popu,dat.tf$gen)
# Housekeeping: make integers into factors, relevel clipping (amd) and rename nutrient levels
dat.tf <- transform(dat.tf,
                    X=factor(X),
                    gen=factor(gen),
                    rack=factor(rack),
                    amd=factor(amd,levels=c("unclipped","clipped")),
                    nutrient=factor(nutrient,labels=c("Low","High")))
# Install/load packages
if(!require(lme4)){install.packages("lme4")}
require(lme4)
if(!require(coefplot2)){install.packages("coefplot2",repos="http://www.math.mcmaster.ca/bolker/R",type="source")}
require(coefplot2)
if(!require(reshape)){install.packages("reshape")}
require(reshape)
if(!require(ggplot2)){install.packages("ggplot2")}
require(ggplot2)
if(!require(plyr)){install.packages("plyr")}
require(plyr)
if(!require(gridExtra)){install.packages("gridExtra")}
require(gridExtra)
if(!require(emdbook)){install.packages("emdbook")}
require(emdbook)
if(!require(bbmle)){install.packages("bbmle")}
require(bbmle)
source("glmm_funs.R")
# Structure in the dataset: response vs fixed effects
ggplot(dat.tf,aes(x=amd,y=log(total.fruits+1),colour=nutrient)) +
  geom_point() +
  ## need to use as.numeric(amd) to get lines
  stat_summary(aes(x=as.numeric(amd)),fun.y=mean,geom="line") +
  theme_bw() + theme(panel.margin=unit(0,"lines")) +
  scale_color_manual(values=c("#3B9AB2","#F21A00")) + # from Wes Anderson Zissou palette
  facet_wrap(~popu)
ggplot(dat.tf,aes(x=amd,y=log(total.fruits+1),colour=nutrient)) +
  geom_point() +
  ## need to use as.numeric(amd) to get lines
  stat_summary(aes(x=as.numeric(amd)),fun.y=mean,geom="line") +
  theme_bw() + theme(panel.margin=unit(0,"lines")) +
  scale_color_manual(values=c("#3B9AB2","#F21A00")) + # from Wes Anderson Zissou palette
  facet_wrap(~gen)
# Exploring heterogeneity across groups
# Create new variables that represent every combination of nutrient x clipping x random factor
dat.tf <- within(dat.tf, {
  # genotype x nutrient x clipping
  gna <- interaction(gen,nutrient,amd)
  gna <- reorder(gna, total.fruits, mean)
  # population x nutrient x clipping
  pna <- interaction(popu,nutrient,amd)
  pna <- reorder(pna, total.fruits, mean)
})
# Boxplot of total fruits vs new variable (genotype x nutrient x clipping)
ggplot(data = dat.tf, aes(factor(x = gna),y = log(total.fruits + 1))) +
  geom_boxplot(colour = "skyblue2", outlier.shape = 21, outlier.colour = "skyblue2") +
  theme_bw() + theme(axis.text.x=element_text(angle=90)) +
  stat_summary(fun.y=mean, geom="point", colour = "red")
# Boxplot of total fruits vs new variable (population x nutrient x clipping)
ggplot(data = dat.tf, aes(factor(x = pna),y = log(total.fruits + 1))) +
  geom_boxplot(colour = "skyblue2", outlier.shape = 21, outlier.colour = "skyblue2") +
  theme_bw() + theme(axis.text.x=element_text(angle=90)) +
  stat_summary(fun.y=mean, geom="point", colour = "red")
# There is substantial variation among the sample variances on the transformed data.
# For example, among genotypes:
grpVars <- tapply(dat.tf$total.fruits, dat.tf$gna, var)
summary(grpVars)
grpMeans <- tapply(dat.tf$total.fruits,dat.tf$gna, mean)
summary(grpMeans)
# Quasi-Poisson
lm1 <- lm(grpVars~grpMeans-1)
phi.fit <- coef(lm1)
# The -1 specifies a model with the intercept set to zero
# Negative binomial
lm2 <- lm(grpVars ~ I(grpMeans^2) + offset(grpMeans)-1)
k.fit <- 1/coef(lm2)
# The offset() is used to specify that we want the group means added as a term with its coefficient fixed to 1
# Non-parametric loess fit
Lfit <- loess(grpVars~grpMeans)
plot(grpVars ~ grpMeans, xlab="group means", ylab="group variances")
abline(a=0,b=1, lty=2)
text(105,500,"Poisson")
curve(phi.fit*x, col=2,add=TRUE)
# bquote() is used to substitute numeric values in equations with symbols
text(110,3900, bquote(paste("QP: ",sigma^2==.(round(phi.fit,1))*mu)),col=2)
curve(x*(1+x/k.fit),col=4,add=TRUE)
text(104,7200,paste("NB: k=",round(k.fit,1),sep=""),col=4)
mvec <- 0:120
lines(mvec,predict(Lfit,mvec),col=5)
text(118,2000,"loess",col=5)
# Same graph but with ggplot
ggplot(data.frame(grpMeans,grpVars), aes(x=grpMeans,y=grpVars)) +
  geom_point() +
  geom_smooth(colour="blue",fill="blue") +
  geom_smooth(method="lm",formula=y~x-1,colour="red",fill="red") +
  geom_smooth(method="lm",formula=y~I(x^2)+offset(x)-1, colour="purple",fill="purple")
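# A quick numeric companion to these plots (a sketch, not in the original script):
# the overall variance-to-mean ratio of the response is far above 1, another hint
# that a plain Poisson model will be overdispersed.
var(dat.tf$total.fruits) / mean(dat.tf$total.fruits)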
#### Poisson GLMM ####
# Start with all fixed effects in the model and random intercepts for popu and gen
mp1 <- glmer(total.fruits ~ nutrient*amd + rack + status +
               (1|popu) + (1|gen),
             data=dat.tf, family="poisson")
# Overdispersion?
overdisp_fun(mp1)
# Or, as above, we can approximate this by dividing the residual deviance by the residual df
summary(mp1)
# deviance = 18253.7 and df.resid = 616
# mp1.df.resid <- as.numeric(summary(mp1)$AICtab["df.resid"])
# deviance(mp1)/mp1.df.resid

#### NB GLMM ####
mnb1 <- glmer.nb(total.fruits ~ nutrient*amd + rack + status +
                   (1|popu) + (1|gen),
                 data=dat.tf, control=glmerControl(optimizer="bobyqa"))
# Overdispersion?
overdisp_fun(mnb1)
# Much better, and the overdispersion test is no longer significant!

#### Poisson-lognormal ####
mpl1 <- glmer(total.fruits ~ nutrient*amd + rack + status +
                (1|X) + (1|popu) + (1|gen),
              data=dat.tf, family="poisson", control=glmerControl(optimizer="bobyqa"))
overdisp_fun(mpl1)

# Evaluating the random intercepts
summary(mpl1)
summary(mpl1)$varcor
mpl1.popu <- glmer(total.fruits ~ nutrient*amd + rack + status +
                     (1|X) + (1|popu), # popu only (drop gen)
                   data=dat.tf, family="poisson", control=glmerControl(optimizer="bobyqa"))
anova(mpl1,mpl1.popu)
mpl1.gen <- glmer(total.fruits ~ nutrient*amd + rack + status +
                    (1|X) + (1|gen), # gen only (drop popu)
                  data=dat.tf, family="poisson", control=glmerControl(optimizer="bobyqa"))
anova(mpl1,mpl1.gen)

# Diagnostic plots
locscaleplot(mpl1,col=ifelse(dat.tf$total.fruits==0,"blue","black"))
# Plotting variance terms
coefplot2(mpl1,ptype="vcov",intercept=TRUE,main="Random effect variance")
# Plot of fixed effects
coefplot2(mpl1,intercept=TRUE,main="Fixed effect coefficient")
# Plotting random intercepts
pp <- list(layout.widths=list(left.padding=0, right.padding=0),
           layout.heights=list(top.padding=0, bottom.padding=0))
r2 <- ranef(mpl1,condVar=TRUE)
d2 <- dotplot(r2, par.settings=pp)
grid.arrange(d2$gen,d2$popu,nrow=1)

# Lognormal-Poisson with random slopes of clipping (amd) within popu and gen
mpl2 <- glmer(total.fruits ~ nutrient*amd + rack + status +
                (1|X) + (amd|popu) + (amd|gen),
              data=dat.tf, family="poisson", control=glmerControl(optimizer="bobyqa"))
# View variance-covariance components
summary(mpl2)                          # option 1
attr(VarCorr(mpl2)$gen,"correlation")  # option 2
printvc(mpl2)                          # option 3

# Evaluating fixed effects
(dd_LRT <- drop1(mpl1,test="Chisq"))
(dd_AIC <- dfun(drop1(mpl1)))
# Drop the interaction between clipping and nutrients
# (note that this overwrites the random-slope model stored in mpl2 above)
mpl2 <- update(mpl1, . ~ . - amd:nutrient)
anova(mpl1,mpl2)
(dd_LRT <- drop1(mpl2,test="Chisq"))
(dd_AIC <- dfun(drop1(mpl2)))
summary(mpl2)

#### GLMM Challenge: Solution ####
inverts <- read.csv("inverts.csv")
str(inverts)
# Create a new variable that represents every combination of taxon x feeding type x temperature
inverts <- within(inverts, {
  # taxon x feeding.type x temp
  tft <- interaction(taxon,feeding.type,temp)
  tft <- reorder(tft, PLD, mean)
})
# Boxplot of PLD vs the new variable (taxon x feeding type x temperature)
ggplot(data = inverts, aes(factor(x = tft),y = log(PLD))) +
  geom_boxplot(colour = "skyblue2", outlier.shape = 21, outlier.colour = "skyblue2") +
  theme_bw() + theme(axis.text.x=element_text(angle=90)) +
  stat_summary(fun.y=mean, geom="point", colour = "red")
# There is substantial variation among the sample variances on the transformed data,
# for example among taxon x feeding type x temperature combinations:
grpVars <- tapply(inverts$PLD, inverts$tft, var)
summary(grpVars)
grpMeans <- tapply(inverts$PLD,inverts$tft, mean)
summary(grpMeans)
# Quasi-Poisson
lm1 <- lm(grpVars~grpMeans-1)
phi.fit <- coef(lm1)
# The -1 specifies a model with the intercept set to zero
# Negative binomial
lm2 <- lm(grpVars ~ I(grpMeans^2) + offset(grpMeans)-1)
k.fit <- 1/coef(lm2)
# The offset() is used to specify that we want the group means added as a term with its coefficient fixed to 1
# Non-parametric loess fit
Lfit <- loess(grpVars~grpMeans)
plot(grpVars ~ grpMeans, xlab="group means", ylab="group variances")
abline(a=0,b=1, lty=2)
text(60,200,"Poisson")
curve(phi.fit*x, col=2,add=TRUE)
# bquote() is used to substitute numeric values in equations with symbols
text(60,800, bquote(paste("QP: ",sigma^2==.(round(phi.fit,1))*mu)),col=2)
curve(x*(1+x/k.fit),col=4,add=TRUE)
text(60,1600,paste("NB: k=",round(k.fit,1),sep=""),col=4)
mvec <- 0:120
lines(mvec,predict(Lfit,mvec),col=5)
text(50,1300,"loess",col=5)
# Response vs fixed effects
ggplot(inverts,aes(x=temp,y=log(PLD+1),colour=feeding.type)) +
  geom_point() +
  stat_summary(aes(x=as.numeric(temp)),fun.y=mean,geom="line") +
  theme_bw() +
  scale_color_manual(values=c("#3B9AB2","#F21A00")) + # from Wes Anderson Zissou palette
  facet_wrap(~taxon)
# Poisson GLMM
mp1 <- glmer(PLD ~ temp*feeding.type + (1|taxon), data=inverts, family="poisson")
overdisp_fun(mp1)
# NB GLMM
mnb1 <- glmer.nb(PLD ~ temp*feeding.type + (1|taxon), data=inverts)
overdisp_fun(mnb1)
# Looks good!
# Evaluating the random intercepts
summary(mnb1)$varcor
mnb1.taxless <- glm.nb(PLD ~ temp*feeding.type, data=inverts)
# Here, because we are comparing a glmer fit with a glm fit, we must do something other
# than calling anova() to test the importance of the random intercept.
# We will compare the likelihoods of the two models:
NL1 <- -logLik(mnb1)
NL0 <- -logLik(mnb1.taxless)
devdiff <- 2*(NL0-NL1)
dfdiff <- attr(NL1,"df")-attr(NL0,"df")
pchisq(devdiff,dfdiff,lower.tail=FALSE)
# We could also look at the dAIC to compare the models with (mnb1) and without (mnb1.taxless) the random effect,
# using the AICtab() function:
AICtab(mnb1,mnb1.taxless)
# Diagnostic plots
locscaleplot(mnb1)
# Plotting variance terms
coefplot2(mnb1,ptype="vcov",intercept=TRUE,main="Random effect variance")
# Plot of fixed effects
coefplot2(mnb1,intercept=TRUE,main="Fixed effect coefficient")
# Plotting random intercepts
pp <- list(layout.widths=list(left.padding=0, right.padding=0))
r2 <- ranef(mnb1,condVar=TRUE)
d2 <- dotplot(r2, par.settings=pp)
grid.arrange(d2$taxon,nrow=1)
# NB GLMM with a random slope for taxon
mnb2 <- glmer.nb(PLD ~ temp*feeding.type + (PLD|taxon), data=inverts)
# View variance-covariance components
summary(mnb2)                            # option 1
attr(VarCorr(mnb2)$taxon,"correlation")  # option 2
printvc(mnb2)                            # option 3
# Evaluating fixed effects
# Note: to run drop1() we need to specify the theta parameter and refit the NB model with glmer():
theta.mnb1 <- theta.md(inverts$PLD, fitted(mnb1), dfr = df.residual(mnb1))
mnb1 <- glmer(PLD ~ temp*feeding.type + (1|taxon), data=inverts,
              family=negative.binomial(theta=theta.mnb1))
(dd_LRT <- drop1(mnb1,test="Chisq"))
(dd_AIC <- dfun(drop1(mnb1)))
# The dAIC when the feeding.type x temp interaction is dropped is greater than 2,
# so we keep the interaction in the model.
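# As a final reporting step (a sketch, not part of the original solution), approximate Wald
# confidence intervals for the fixed effects of the retained model can be obtained with:
confint(mnb1, method = "Wald")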