#######################################################################

## QCBS R Workshop Series ##

## GGPLOT // RESHAPE // PLYR ##

  ## Author: Quebec Center for Biodiversity Science
  ## Materials Generated & Amalgamated by: Amanda Winegardner, Xavier Giroux-Bougard, Maxwell Farrell, Etienne Low-Decarie
  ## Last updated: July 29th 2014
  ## Built under R version 3.1.0 


#### 0. Housekeeping ####

# Clean up your current working directory
rm(list=ls())

# Install and/or load required packages

if(!require(ggplot2)){install.packages("ggplot2")}
require(ggplot2)

if(!require(reshape2)){install.packages("reshape2")}
require(reshape2)

if(!require(plyr)){install.packages("plyr")}
require(plyr)

if(!require(gridExtra)){install.packages("gridExtra")}
require(gridExtra)


#--------------------------------------------------#
# Plotting in R using grammar of graphics (ggplot2) 
#--------------------------------------------------#

#### 1. Basic scatter plot ####

# Explore the qplot help file
?qplot


# Explore the Iris dataset
data(iris)
?iris
head(iris)
str(iris)
names(iris)

# Most basic scatter plot
basic.plot<-qplot(data=iris,
                  x=Sepal.Length,
                  y=Sepal.Width)
print(basic.plot)

# Most basic scatter plot (categorical data)
categorical.plot<-qplot(data=iris,
                        x=Species,
                        y=Sepal.Width)
print(categorical.plot)


# Edited most basic scatter plot
basic.plot<-qplot(data=iris,
                  x=Sepal.Length,
                  xlab="Sepal Length (mm)",
                  y=Sepal.Width,
                  ylab="Sepal Width (mm)",
                  main="Sepal dimensions")
print(basic.plot)


#-------------#
# Challenge 1 # 
#-------------#


# SOLUTION:
?CO2
data(CO2)
qplot(data = CO2,
      x = conc,
      xlab = "Concentration de CO2 (mL/L)",
      y = uptake,
      ylab = "Absorption de CO2 (umol/m^2 sec)",
      main = "Absorption de CO2 chez une espèce de graminée")


#### 2. Grammar of graphics ####

#### 2.1 Adding aesthetics and geoms ####

# Compare syntax for qplot() vs ggplot()

qplot(data=iris,
      x=Sepal.Length,
      xlab="Sepal Length (mm)",
      y=Sepal.Width,
      ylab="Sepal Width (mm)",
      main="Sepal dimensions")

ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width)) +
  geom_point()+
  xlab("Sepal Length (mm)")+
  ylab("Sepal Width (mm)")+
  ggtitle("Sepal dimensions")

# Assign ggplot to object

basic.plot <- ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width)) +
                geom_point()+
                xlab("Sepal Length (mm)")+
                ylab("Sepal Width (mm)")+
                ggtitle("Sepal dimensions")

# Adding aesthetic to basic scatter plot
basic.plot <- basic.plot +
              aes(colour = Species, shape = Species)

# Adding geoms to basic scatter plot
linear.smooth.plot <- basic.plot + 
                      geom_smooth(method="lm", se = FALSE)
print(linear.smooth.plot)

#-------------#
# Challenge 2 # 
#-------------#

# Explore the CO2 dataset
data(CO2)
?CO2
head(CO2)
str(CO2)
names(CO2)

# Solution
data(CO2)
CO2.plot <- ggplot(data = CO2, aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  xlab("Concentration en CO2 (mL/L)") +
  ylab("Absorption de CO2 (umol/m^2 sec)") +    
  ggtitle("Absorption de CO2 par une espèce de graminée")
CO2.plot <- CO2.plot + geom_smooth(method = "lm")
print(CO2.plot)

# Could also create a smoothed curve and also colour you SE intervals by factor
CO2.plot <- ggplot(data = CO2, aes(x = conc, y = uptake, colour = factor(Treatment))) +
  geom_point() +
  xlab("Concentration en CO2 (mL/L)") +
  ylab("Absorption de CO2 (umol/m^2 sec)") +    
  ggtitle("Absorption de CO2 par une espèce de graminée")
CO2.plot <- CO2.plot + geom_smooth(method = "loess",aes(fill=factor(Treatment)))
print(CO2.plot)


#### 2.2 Adding facets and groups ####

# basic plot from CO2
CO2.plot <- ggplot(data = CO2, aes(x=conc, y=uptake, colour= Treatment)) +
              geom_point()
 
print(CO2.plot)

# Adding facets
CO2.plot<-CO2.plot + facet_grid(.~Type)
print(CO2.plot)

# Adding line geoms
print(CO2.plot + geom_line())

# Specifying groups
CO2.plot <- CO2.plot + 
  geom_line(aes(group = Plant))
print(CO2.plot)

#-------------#
# Challenge 3 # 
#-------------#

data(msleep)
data(OrchardSprays)

# Solution

data(OrchardSprays)
box.plot  <- ggplot(data = OrchardSprays, aes(x = treatment, y = decrease)) +
  geom_boxplot()
print(box.plot)

#------------#
# Exercise 3 # 
#------------#


#### 3. Saving plots ####

pdf(“./plots/todays_plots.pdf”)
print(basic.plot)
print(plot.with.linear.smooth)
print(categorical.plot)
print(CO2.plot)
graphics.off()


#### 4. Fine tunning ####

#### 4.1 Fine tunning colours ####
CO2.plot+scale_colour_manual(values=c("nonchilled"="red","chilled"="blue"))

# Bonus!!! RColorBrewer
if(!require(RcolorBrewer)) {install.packages("RColorBrewer")}
require(RColorBrewer)

basic.plot + scale_color_brewer(palette="Dark2")


# Bonus!!! Wes Anderson colour palette
if(!require(devtools)) {install.packages("devtools")}
require(devtools)
devtools::install_github("wesanderson","karthik")
require(wesanderson)

basic.plot + 
  scale_color_manual(values = wes.palette(3, "Darjeeling")) 

#### 4.2 Fine tunning scales ####
CO2.plot + scale_y_continuous(name = "CO2 uptake rate",
                              breaks = seq(5,50, by= 10),
                              labels = seq(5,50, by= 10), 
                              trans="log10")

#### 4.3 Fine tunning themes ####
CO2.plot + theme_bw()

# BONUS: ggtheme package
if(!require(ggthemes)) {install.packages("ggthemes")}
require(ggthemes)

CO2.plot + theme_tufte()


#-------------#
# base R plot #
#-------------#
plot(iris)
lm <- lm(Sepal.Length~Petal.Width, data = iris)
x11()
plot(lm)

#### 5. Bonus! - Ecologists who may become vegan users ####

install_github("ggvegan", "gavinsimpson")
require(ggvegan)
data(dune)
data(dune.env)
sol <- cca(dune ~ A1 + Management, data = dune.env)
autoplot(sol)
data(mite)
data(mite.env)
mite.hel = decostand(mite, "hel")
rda <- rda(mite.hel ~ WatrCont + Shrub, mite.env)  # Model with all explanatory variables
x11()
ggvegan.plot <- autoplot(rda) + theme_bw()
normal.plot <- plot(rda)


#------------------------------------------------------#
# Reshaping and aggregating your data (reshape2 & plyr) 
#------------------------------------------------------#

# Source materials:

# Reshape / Reshape2
# http://seananderson.ca/2013/10/19/reshape.html
# http://stat405.had.co.nz/lectures/11-adv-data-manip.pdf
# http://stat405.had.co.nz/lectures/19-tables.pdf
# http://had.co.nz/reshape/introduction.pdf
# http://cran.r-project.org/web/packages/reshape2/reshape2.pdf

# Plyr
# http://seananderson.ca/courses/12-plyr/plyr_2012.pdf


# Data 
# In addition to iris and CO2, we will use the built-in datasets "airquality" and "ChickWeight"
# Explore the datasets

?airquality
str(airquality)
head(airquality)
names(airquality)

?ChickWeight
head(ChickWeight)
str(ChickWeight)
names(ChickWeight)

# You can also use the following code to find other datasets available in R:
data()


#### 6. Reshape ####

# Why reshape your data?


#### 6.1 Wide vs. Long data ####

# The built-in "iris"  dataset is in "wide" format with variables used as column names
head(iris)

# "long" format data has a column for possible variable types 
# and a column for the values of those variables.

# The format of your data depends on your specific needs,
# but some functions and packages (such as ggplot2) work well with long format data.

# Additionally, long form data can more easily be aggregated and converted
# back into wide form data to provide summaries, or check the balance of sampling designs.


# We can use the "reshape2" package by Hadley Wickham to:
# 1."melt" our data (wide --> long)
# 2."cast" our data (long --> wide)

# Think of working with metal: if you melt metal, it drips and becomes long. 
# If you cast it into a mould, it becomes wide.


#### 6.2 Melt ####
## MAKING YOUR DATA LONG ##

?melt

# Let's try to "melt" the iris dataset using the defalut arguments

iris.long <- melt(iris)
# Note the message "Using Species as id variables" appears in the console.

head(iris.long)
tail(iris.long)

# melt by default assumes that all columns with class "numeric" 
# are variables to be melted into a "variable" and a "value" column.

# Often this is what you will want to do, but if you have nested factors, you may want to specify the id variables. 

# You need to tell melt which of your variables are id variables, and which are measured variables.
# If you only supply one of id.vars and measure.vars, melt will assume the remainder of the
# variables in the data set belong to the other. If you supply neither, melt will assume factor and
# character variables are id variables, and all others are measured.


# Let's try this with the C02 dataset:

CO2.long <- melt(CO2)
head(CO2)
head(CO2.long)
tail(CO2.long)

# This time melt has used combinations of Plant, Type, and Treatment as id variables, 
# each of which has associated measurements for concentration (conc) and uptake


# *********************************************************************************************** #
# CHALLENGE: Which variables in the ChickWeight dataset will be used as default id variables? Why?
# *********************************************************************************************** #

# SOLUTION: 

?ChickWeight
str(ChickWeight)

# "Chick" and "Diet" will be used as default id variables because they are factors, 
# whereas "Time" and "weight" are numeric vectors. Try melting ChickWeight and see for yourself!

# *********************************************************************************************** #


# BONUS!!!  You can also specify id variables by name if needed AND you can 
#           change the column names of your "variable" and "value" in one step:

iris.long <- melt(iris, id.vars = c("Species"),
                  variable.name = "Part_measured", 
                  value.name = "Measurement_cm")
head(iris.long)


#### 6.3 Cast ####
## MAKING YOUR DATA WIDE ##

?cast

# With cast you can not only put your data back into "wide" format, 
# but can also subset, aggregate and get summary stats with one line of code!

# The cast function has two versions depending on the class of object you want cast to output:
# "dcast" produces a data frame
# "acast" produces a vector, matrix, or array 

# Most of the time ecologists work with data frames, so we'll stick to that for now.


# CASTING BASICS:

# cast uses a formula to tell it which melted id variables and the measurement variables to cast:
# ex: object <- dcast(data, id.variable.1 + id.variable.2 ~ measured.variable)


# ************************************************************************************************ #
# CHALLENGE: Try to melt and re-cast the airquality to return the same format as the original data.
# ************************************************************************************************ #

# SOLUTION: 

?airquality
names(airquality)
air.long <- melt(airquality)
head(air.long)
# Notice that since all the variables are either numeric or integer, melt did not use any id variables
# We must specify that we want "Month" and "Day" to be the id variables:

air.long <- melt(airquality, id.vars = c("Month", "Day"))
head(air.long)

air.wide <- dcast(air.long, Month + Day ~ variable)
head(air.wide)

# Now air.wide is back in the same format as the original airquality (although the order of columns is changed)

# ************************************************************************************************ #


#### 6.4 Combining ggplot with reshape ####

##Example with the air quality dataset on using both wide and long data formats 
head(airquality)
# The dataset is in wide format, where measured variables
# (ozone, solar.r, wind and temp) are placed in their own columns.


# Diagnostic plots using the wide format + ggplot2

# 1:  Visualize each individual variable and the range it displays for each month in the timeseries

fMonth<- factor(airquality$Month) #Convert the Month variable to a factor. 

ozone.box <- ggplot(airquality, aes(x=fMonth, y=Ozone)) + geom_boxplot()
solar.box <- ggplot(airquality, aes(x=fMonth, y=Solar.R)) + geom_boxplot()
temp.box  <- ggplot(airquality, aes(x=fMonth, y=Temp)) + geom_boxplot()
wind.box  <- ggplot(airquality, aes(x=fMonth, y=Wind)) + geom_boxplot()


# You can use grid.arrange() in the package gridExtra to put these plots into 1 figure. 

combo.box <- grid.arrange(ozone.box, solar.box, temp.box, wind.box, nrow=2) 
# nrow = number of rows you would like the plots displayed on.

#This arranges the 4 separate plots into one panel for viewing. 
#Note that the scales on the individual y-axes are not the same. 


# 2: You can continue using the wide format of the airquality dataset to make 
#       individual plots of each variable showing day measurements for each month. 

ozone.plot <- ggplot(airquality, aes(x=Day, y=Ozone)) + geom_point() + geom_smooth()
ozone.plot <- ozone.plot + facet_wrap(~Month, nrow=2)

solar.plot <- ggplot(airquality, aes(x=Day, y=Solar.R)) + geom_point() +geom_smooth()
solar.plot <- solar.plot + facet_wrap(~Month, nrow=2)

wind.plot <- ggplot(airquality, aes(x=Day, y=Wind)) + geom_point() +geom_smooth()
wind.plot <- wind.plot + facet_wrap(~Month, nrow=2)

temp.plot <- ggplot(airquality, aes(x=Day, y=Temp)) + geom_point() +geom_smooth()
temp.plot <- temp.plot + facet_wrap(~Month, nrow=2)

#You could even then combine these different faceted plots together:
# (though it looks pretty ugly at the moment) 
combo.facets <- grid.arrange(ozone.plot, solar.plot, wind.plot, temp.plot, nrow=4)


# BUT, what if I'd like to use facet_wrap() for the variables 
# as opposed to by month or put all variables on oneplot? 

# Change data from wide to long format (See back to Section 6.3). 

air.long <- melt(airquality, id.vars = c("Month", "Day"))
head(air.long)

air.wide <- dcast(air.long, Month + Day ~ variable)
head(air.wide)

# Use air.long
fMonth.long <- factor(air.long$Month)

weather <- ggplot(air.long, aes(x=fMonth.long, y=value)) + geom_boxplot()
weather <- weather + facet_wrap(~variable, nrow=2)

# Compare the "weather" plot with "combo.box"
# This is the same data but working with it in wide versus long format has allowed us to make different looking plots.

# The weather plot uses facet_wrap to put all the individual variables on the same scale. 
# This may be useful in many circumstances. However, using the facet_wrap means that 
# we don't see all the variation present in the wind variable.
# In that case, you can modify the code to allow the scales to be determined per facet.

weather <- weather + facet_wrap(~variable, nrow=2, scales="free")

weather

# We can also use the long format data (air.long) to create a plot with 
# all the variables included on a single plot:

weather2 <- ggplot(air.long, aes(x=Day, y=value, colour=variable))+ geom_point()#this plot will put all the day measurements on one plot
weather2 <- weather2 + facet_wrap(~Month, nrow=1) #add this part and again, the observations are split by month

weather2


#### 6.5 Aggregating data with cast ####

# With cast's fun.aggregate argument you can quickly get summary statistics of your data.

# Note: Whenever there are fewer cells in the cast form than there were in the original data format, 
# an aggregation function is necessary. If fun.aggregate is not specified the default function is "length"

# With cast you can also use some additional syntax to simplify your formulas:
# "." corresponds to no variable, useful when creating formulas of the form . ∼ x or x ∼ .
# "..." represents all variables not previously included in the casting formula. 


# EXAMPLE: Checking for a balanced study design

# With cast, you can provide an aggregation function with the argument "fun.aggregate = xxx" 
# To get the number of samples per level of a factor you can use the function "length" 
# and use "..." to tell cast to use all measurement variables
head(iris.long)

dcast(iris.long, Species ~ ..., fun.aggregate = length)

# As there are 50 observations per species per measured variable, we see that sample sizes are equal

# If we wanted to check whether or not the number of observations is equal among species or among Part_measures,
# we just modify the formula slightly so as to not include any response variable by using the "."

dcast(iris.long, Species ~ ., fun.aggregate = length)
dcast(iris.long, Part_measured ~ ., fun.aggregate = length)


# EXAMPLE: Calculating central tendencies

dcast(iris.long, Species ~ Part_measured, fun.aggregate = mean)
dcast(iris.long, Species ~ Part_measured, fun.aggregate = median)

# Try with other functions, but note that id the output of a function returns multiple vectors, (i.e. range, or summary), 
# you will have to play with cast a little more to get the right output (see http://had.co.nz/reshape/introduction.pdf section 4.4)


# ****************************************************************************************** #
# CHALLENGE: Determine which month had the highest mean temperature in the airquality dataset
# ****************************************************************************************** #

# SOLUTION: 
month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean)
month_means
# We can see it is Month 8, although if you have many levels of a factor, 
# you may want to write a line of code to pull out this for you (this is better as is less prone to human error!)

# Ex:
month_means$Month[ which(month_means$Temp == max(month_means$Temp)) ]

# *********


# *********
# CHALLENGE: Determine which month had the highest mean Ozone level in the airquality dataset
# *********

# SOLUTION: 
month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean)
month_means

# All Ozone means are NA!!!
# This is because there are NAs in the original dataset.
# With cast you can specify whether or not to remove NAs before aggregation:

month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean, na.rm = TRUE)
month_means

month_means$Month[ which( month_means$Ozone == max(month_means$Ozone) )]

# *********


# BONUS! "RECAST"

# If you feel comfortable with the melt and cast functions, you can both melt and cast your data at once
# with the recast function.

?recast

# Getting means of environmental variables in the airquality dataset with recast:

recast(airquality, Month ~ variable , fun.aggregate = mean, id.var=c("Month","Day"), na.rm=T)


#### 7. Plyr ####
## MEGA DATA MANIPULATION ##

# plyr provides a set of tools for manipulating data.
# The basic format of plyr is "Split-Apply-Combine" which translates to:
# "Split up some data, perform some function over each of the parts, and comine them back together"

# plyr builds on the built-in apply functions by giving you control over the input and
# output formats and keeping the syntax consistent across all variations.

# The basic format is 2 letters followed by ply(). 
# The ﬁrst letter refers to the format in and the second to the format out.

# The 3 main letters are:
# 1. d = data frame
# 2. a = array (includes matrices)
# 3. l = list

# Since we are dealing with data frames in this workshop, we will stick to ddply, 
# but keep in mind the versatility that plyr allows for dealing with different data types.


# Example: Creating a new data frame with mean and standard deviation of Temperature per month (airquality dataset)

head(airquality)

ddply(airquality, "Month", summarize, 
      mean_Temp = mean(Temp, na.rm=T), 
      sd_Temp = sd(Temp, na.rm=T))


# Augmenting our function to round our created variables to two decimal places:
?round

ddply(airquality, "Month", summarize, 
      mean_Temp = round(mean(Temp, na.rm=T), digits = 2), 
      sd_Temp = round(sd(Temp, na.rm=T), digits = 2))


# If we want to retain our original data frame and simply add in our created variables, 
# we can use "transform" instead of "summarize"

head(ddply(airquality, "Month", transform, 
      mean_Temp = round(mean(Temp, na.rm=T), digits = 2), 
      sd_Temp = round(sd(Temp, na.rm=T), digits = 2)))


# Additionally, if we want to build upon our created variables to create additional columns, we can use "mutate"

# Example: Creating a column with the coefficient of variation of temperature per month:

head( ddply(airquality, "Month", mutate, 
      mean_Temp = round(mean(Temp, na.rm=T), digits = 2), 
      sd_Temp = round(sd(Temp, na.rm=T), digits = 2),
      cv_Temp = sd_Temp/mean_Temp))


# BONUS: If you are just interested in using "summarize" and not "transform", you can already refer to created variables using summarize:

ddply(airquality, "Month", summarize, 
      mean_Temp = round(mean(Temp, na.rm=T), digits = 2), 
      sd_Temp = round(sd(Temp, na.rm=T), digits = 2),
      cv_Temp = sd_Temp/mean_Temp)


# ******************************************************************************************************************************************** #
# CHALLENGE: Create a data frame which diplays the total weight gain of individual chicks and the diet they were supplied (ChickWeight dataset)
# ******************************************************************************************************************************************** #

# SOLUTION: 

names(ChickWeight)
# Use "summarize" with "Diet" and "Chick" as variables.
# Create a new column which subtracts the maximum weight from the minimum weight per chick.
# Note: Since Chick is nested within Diet, we are adding Diet simply to have it retainted in our output

ddply(ChickWeight, c("Diet", "Chick"), summarize, total_growth = (max(weight) - min(weight)))

# ******************************************************************************************************************************************** #


# Check out R style guides to help format your scripts for easy reading:
# https://google-styleguide.googlecode.com/svn/trunk/Rguide.xml