# QCBS R Workshop Series
# Workshop 5 Linear Mixed Models
#Designed by Jacob Ziegler, Dalal Hanna and Catherine Baltazar

################Section 1#########################

# Remove all objects from the current R workspace so the workshop starts clean
rm(list = ls())

# Place all workshop material in one folder on your computer.
# Run the following line and use the browsing window to choose the
# qcbs_w5_data.csv file in the folder that contains the workshop material.
# file.choose() returns the full path of the file you picked, which is kept
# so the folder can be derived from it.
data.path <- file.choose()

# Set the working directory to the folder that contains the chosen file.
# dirname() drops the file name from the path, for example
# "/Users/ziegljac/Documents/QCBS_R/qcbs_w5_data.csv" becomes
# "/Users/ziegljac/Documents/QCBS_R".
# (Calling setwd() with no argument, as a bare placeholder, stops with an error.)
setwd(dirname(data.path))

# Load useful libraries and data
#Note: if you have never loaded these libraries before you will have to use 
#the "install.packages" function before the "library" function
library(ggplot2)
library(lme4)
library(arm)
library(AICcmodavg)
library(beepr)
#AICc is the AIC corrected for small sample sizes
#Always use it: the correction term shrinks as the sample size grows, so AICc converges to AIC

# Read the workshop data from the working directory.
# stringsAsFactors = TRUE stores text columns (presumably Lake and Fish_Species -
# confirm with str(data)) as factors, which is what read.csv() did by default
# before R 4.0.0. The plot(residuals ~ group) calls further down need factor
# grouping variables to draw boxplots; with character columns they fail.
data <- read.csv("qcbs_w5_data.csv", stringsAsFactors = TRUE)

# Shared theme layer added to every figure to keep it simple:
# black-and-white base, no grid lines, blank panel/strip/legend backgrounds,
# and a plain black border around each panel
fig <- theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_blank(),
    strip.text.y = element_text(),
    legend.background = element_blank(),
    legend.key = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA)
  )

# Make the following three plots to explore the data.
# `plot` holds the shared base (length on x, trophic position on y);
# each figure adds points, labels and the simplified theme to it.
plot <- ggplot(data = data, mapping = aes(x = Fish_Length, y = Trophic_Pos))

# Plot 1 - All Data
plot +
  geom_point() +
  labs(x = "Length (mm)", y = "Trophic Position", title = "All Data") +
  fig

# Plot 2 - By Species - BG = Bluegill, WY = Walleye, and YP = Yellow Perch
plot +
  geom_point() +
  facet_wrap(~ Fish_Species) +
  labs(x = "Length (mm)", y = "Trophic Position", title = "By Species") +
  fig

# Plot 3 - By Lake
plot +
  geom_point() +
  facet_wrap(~ Lake) +
  labs(x = "Length (mm)", y = "Trophic Position", title = "By Lake") +
  fig

################Section 2#########################
#Running a mixed model in R
#Four Step Process to build a mixed model in R

#1) A priori model building and data exploration####
#i)  Map out the model based on a priori knowledge
#We know that we want to build a model that evaluates the relationship 
#between trophic position and length while accounting for lake and species variation
#Trophic Position ~ Length + Species + Lake

#ii) Housekeeping and data exploration
#Confirm that every column was read in with the expected type
str(data)

#Count the observations in each lake and each species to spot
#any strongly unbalanced sampling across the grouping factors
table(data[["Lake"]])
table(data[["Fish_Species"]])

#Inspect the distribution of each continuous variable
#Transform if necessary (this avoids later problems with homogeneity of model residuals)
hist(data$Fish_Length)
hist(data$Trophic_Pos)

#Check for collinearity between variables:
#pairwise plots of all columns, then the length / trophic position correlation
plot(data)
with(data, cor(Fish_Length, Trophic_Pos))
#Note that this data set does not contain multiple continuous predictors
#whose correlation might cause problems.
#If, for example, we had both length and mass data,
#we would not want to use both, as they would likely be highly correlated

#Consider the scales of your variables
#Note: when two variables have very different ranges, the criteria mixed models use
#to come up with parameter estimates are likely to return 'convergence errors'
#Z-correcting adjusts for this scaling problem.
#What is a z-correction? Subtract the mean and divide by the standard deviation:
z.correct <- function(x) {
  (x - mean(x)) / sd(x)
}
#Z-corrected Length
data$Z_Length <- z.correct(data$Fish_Length)
#Z-corrected Trophic Position
data$Z_TP <- z.correct(data$Trophic_Pos)


#Find out if it is important to account for variation in "random effects"
#by plotting the residuals of a linear model fitted without random effects
#against each potential random effect
lm.test <- lm(Z_TP ~ Z_Length, data = data)
lm.test.resid <- rstandard(lm.test)

#The grouping variables are wrapped in factor() so that one boxplot per group
#is drawn whether the columns were read in as factors or as character strings
#(plot(y ~ x) with a character x fails instead of drawing boxplots).

#Species Effect
boxplot(lm.test.resid ~ factor(data$Fish_Species),
        xlab = "Species", ylab = "Standardized residuals")
abline(h = 0, lty = 2)
#Lake Effect
boxplot(lm.test.resid ~ factor(data$Lake),
        xlab = "Lake", ylab = "Standardized residuals")
abline(h = 0, lty = 2)

#2) Coding potential models and model selection####
#i) Coding all potential models
#Every candidate shares the fixed effect Z_Length; the candidates differ only in
#which grouping factors get a random intercept (1 | group) or a random
#intercept and slope (1 + Z_Length | group).
#Note: you can choose not to code models that do not make biological sense.

#M0: linear model with no random effects
#NOTE: to compare M0 with the lmer models below, every lmer model must be
#fitted with REML = FALSE
M0 <- lm(Z_TP ~ Z_Length, data = data)

#M1: species and lake, varying intercepts
M1 <- lmer(Z_TP ~ Z_Length + (1 | Fish_Species) + (1 | Lake),
           data = data, REML = TRUE)

#M2: species and lake, varying intercepts and slopes
M2 <- lmer(Z_TP ~ Z_Length + (1 + Z_Length | Fish_Species) + (1 + Z_Length | Lake),
           data = data, REML = TRUE)

#M3: species only (no lake), varying intercepts
M3 <- lmer(Z_TP ~ Z_Length + (1 | Fish_Species),
           data = data, REML = TRUE)

#M4: lake only (no species), varying intercepts
M4 <- lmer(Z_TP ~ Z_Length + (1 | Lake),
           data = data, REML = TRUE)

#M5: species only (no lake), varying intercepts and slopes
M5 <- lmer(Z_TP ~ Z_Length + (1 + Z_Length | Fish_Species),
           data = data, REML = TRUE)

#M6: lake only (no species), varying intercepts and slopes
M6 <- lmer(Z_TP ~ Z_Length + (1 + Z_Length | Lake),
           data = data, REML = TRUE)

#M7: species and lake, varying intercepts; slopes vary by lake only
M7 <- lmer(Z_TP ~ Z_Length + (1 | Fish_Species) + (1 + Z_Length | Lake),
           data = data, REML = TRUE)

#M8: species and lake, varying intercepts; slopes vary by species only
M8 <- lmer(Z_TP ~ Z_Length + (1 + Z_Length | Fish_Species) + (1 | Lake),
           data = data, REML = TRUE)


#ii) Compare models using AICc values
#Labels for the mixed models being compared (M1 to M8)
Model <- paste0("M", 1:8)
#AICc value of each model, in the same order as the labels
AICc <- c(
  AICc(M1), AICc(M2), AICc(M3), AICc(M4),
  AICc(M5), AICc(M6), AICc(M7), AICc(M8)
)
#Combine labels and values into one table for easy comparison
AICtable <- data.frame(Model, AICc)
AICtable
#M8 has the lowest AICc value so it has the most predictive power
#M2 is also a good fit, but all other models are not nearly as good.
#Note: when you compare models with different fixed effects they must be fitted by
#Maximum Likelihood (ML) and not by Restricted Maximum Likelihood (REML)

#3) Checking model assumptions####
#All checks below are run on M8
#E1 = residuals of M8, F1 = fitted values of M8
F1 <- fitted(M8)
E1 <- resid(M8)

#A. Look at independence: plot fitted values vs residuals
plot(F1, E1, xlab = "Fitted Values", ylab = "Normalized residuals")
abline(h = 0, lty = 2)

#B. Look at homogeneity:
# i. plot residuals vs each covariate in the model
#Fish_Length (z-corrected)
plot(data$Z_Length, E1, xlab = "Z Length", ylab = "Normalized residuals")
abline(h = 0, lty = 2)
#Note: the groupings visible in this plot come from the nature of the data: in this data set
#we only measured individuals from 5 categories of lengths (big, small, and three groups in between)

#Species
boxplot(E1 ~ Fish_Species, data = data,
        xlab = "Species", ylab = "Normalized residuals")
abline(h = 0, lty = 2)
#Lake
boxplot(E1 ~ Lake, data = data,
        xlab = "Lake", ylab = "Normalized residuals")
abline(h = 0, lty = 2)

# ii. plot residuals vs each covariate not in the model
#NA in the case of this data set

#C. Look at normality: histogram of the residuals
hist(E1)

#4) Interpreting results and visualizing the model####
#If REML = FALSE was set for the model comparisons, re-fit the chosen model by REML here.
#REML is a more conservative estimate, so the rule is to use this estimation method
#when obtaining model coefficients
M8 <- lmer(Z_TP ~ Z_Length + (1 + Z_Length | Fish_Species) + (1 | Lake),
           data = data, REML = TRUE)

#Look at the model summary
#It gives an idea of the variance explained by the different components
#of the model and of the "significance" of the fixed effects
summary(M8)

#Visualizing model results####
#There are several ways of visualizing the results of a mixed model, all of which
#rely on the coefficients generated by the model.
#So the first step is to get the model coefficients so they can be added to the figures
coef(M8)
#Store the coefficients of each grouping factor in its own data frame,
#with the two columns renamed "Intercept" and "Slope" to make them easier to manipulate
Lake.coef <- setNames(as.data.frame(coef(M8)[["Lake"]]), c("Intercept", "Slope"))
Species.coef <- setNames(as.data.frame(coef(M8)[["Fish_Species"]]), c("Intercept", "Slope"))

# Plot 1 - All Data
#Base plot of the z-corrected variables; `plot` is reused by the later figures
plot <- ggplot(aes(Z_Length, Z_TP), data = data)
#Make a plot that includes all the data
#The axes are labelled as z-scores because Z_Length and Z_TP are standardized values (not mm)
Plot_AllData <- plot + geom_point() + xlab("Z Length") + ylab("Z Trophic Position") + labs(title = "All Data") + fig
#Add a line with the intercept and slope of the overall relationship between length and trophic position
#These are the fixed effects shown in the model summary; they are extracted with fixef()
#rather than typed in by hand, so the line always matches the fitted model
summary(M8)
M8.fixed <- fixef(M8)
Plot_AllData + geom_abline(intercept = M8.fixed[["(Intercept)"]], slope = M8.fixed[["Z_Length"]])

# Plot 2 - By Species
#Plot the data color coded by species
#The axes are labelled as z-scores because Z_Length and Z_TP are standardized values (not mm)
Plot_BySpecies <- plot +
  geom_point(aes(colour = factor(Fish_Species)), size = 4) +
  xlab("Z Length") + ylab("Z Trophic Position") + labs(title = "By Species") + fig
#Build one row per species holding its name (the row names of Species.coef),
#its intercept (column 1) and its slope (column 2)
Species.lines <- data.frame(Fish_Species = rownames(Species.coef),
                            Intercept = Species.coef[, 1],
                            Slope = Species.coef[, 2])
#Add the regression line specific to each species. Mapping colour to the species name
#puts the lines on the same colour scale as the points, so each line matches its points
#for any number of species; show.legend = FALSE keeps the legend showing points only
Plot_BySpecies +
  geom_abline(aes(intercept = Intercept, slope = Slope, colour = factor(Fish_Species)),
              data = Species.lines, show.legend = FALSE)

# Plot 3 - By Lake
#Plot the data color coded by lake
#The axes are labelled as z-scores because Z_Length and Z_TP are standardized values (not mm)
Plot_ByLake <- plot +
  geom_point(aes(colour = factor(Lake)), size = 4) +
  xlab("Z Length") + ylab("Z Trophic Position") + labs(title = "By Lake") + fig
#Build one row per lake holding its name (the row names of Lake.coef),
#its intercept (column 1) and its slope (column 2)
Lake.lines <- data.frame(Lake = rownames(Lake.coef),
                         Intercept = Lake.coef[, 1],
                         Slope = Lake.coef[, 2])
#Add the regression line specific to each lake. Mapping colour to the lake name
#puts the lines on the same colour scale as the points, so each line matches its points
#for any number of lakes; show.legend = FALSE keeps the legend showing points only
Plot_ByLake +
  geom_abline(aes(intercept = Intercept, slope = Slope, colour = factor(Lake)),
              data = Lake.lines, show.legend = FALSE)

# Play two notification sounds (sound 4, then sound 3) to signal the end of the script
beep(4)
beep(3)
#Thanks for attending the workshop and/or checking out this code!
#We hope that this has been helpful to you!