####################################################################### ## QCBS R Workshop Series ## ## GGPLOT // RESHAPE // PLYR ## ## Author: Quebec Center for Biodiversity Science ## Materials Generated & Amalgamated by: Amanda Winegardner, Xavier Giroux-Bougard, Maxwell Farrell, Etienne Low-Decarie ## Last updated: July 29th 2014 ## Built under R version 3.1.0 #### 0. Housekeeping #### # Clean up your current working directory rm(list=ls()) # Install and/or load required packages if(!require(ggplot2)){install.packages("ggplot2")} require(ggplot2) if(!require(reshape2)){install.packages("reshape2")} require(reshape2) if(!require(plyr)){install.packages("plyr")} require(plyr) if(!require(gridExtra)){install.packages("gridExtra")} require(gridExtra) #--------------------------------------------------# # Plotting in R using grammar of graphics (ggplot2) #--------------------------------------------------# #### 1. Basic scatter plot #### # Explore the qplot help file ?qplot # Explore the Iris dataset data(iris) ?iris head(iris) str(iris) names(iris) # Most basic scatter plot basic.plot<-qplot(data=iris, x=Sepal.Length, y=Sepal.Width) print(basic.plot) # Most basic scatter plot (categorical data) categorical.plot<-qplot(data=iris, x=Species, y=Sepal.Width) print(categorical.plot) # Edited most basic scatter plot basic.plot<-qplot(data=iris, x=Sepal.Length, xlab="Sepal Length (mm)", y=Sepal.Width, ylab="Sepal Width (mm)", main="Sepal dimensions") print(basic.plot) #-------------# # Challenge 1 # #-------------# # SOLUTION: ?CO2 data(CO2) qplot(data = CO2, x = conc, xlab = "Concentration de CO2 (mL/L)", y = uptake, ylab = "Absorption de CO2 (umol/m^2 sec)", main = "Absorption de CO2 chez une espèce de graminée") #### 2. Grammar of graphics #### #### 2.1 Adding aesthetics and geoms #### # Compare syntax for qplot() vs ggplot() qplot(data=iris, x=Sepal.Length, xlab="Sepal Length (mm)", y=Sepal.Width, ylab="Sepal Width (mm)", main="Sepal dimensions") ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width)) + geom_point()+ xlab("Sepal Length (mm)")+ ylab("Sepal Width (mm)")+ ggtitle("Sepal dimensions") # Assign ggplot to object basic.plot <- ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width)) + geom_point()+ xlab("Sepal Length (mm)")+ ylab("Sepal Width (mm)")+ ggtitle("Sepal dimensions") # Adding aesthetic to basic scatter plot basic.plot <- basic.plot + aes(colour = Species, shape = Species) # Adding geoms to basic scatter plot linear.smooth.plot <- basic.plot + geom_smooth(method="lm", se = FALSE) print(linear.smooth.plot) #-------------# # Challenge 2 # #-------------# # Explore the CO2 dataset data(CO2) ?CO2 head(CO2) str(CO2) names(CO2) # Solution data(CO2) CO2.plot <- ggplot(data = CO2, aes(x = conc, y = uptake, colour = Treatment)) + geom_point() + xlab("Concentration en CO2 (mL/L)") + ylab("Absorption de CO2 (umol/m^2 sec)") + ggtitle("Absorption de CO2 par une espèce de graminée") CO2.plot <- CO2.plot + geom_smooth(method = "lm") print(CO2.plot) # Could also create a smoothed curve and also colour you SE intervals by factor CO2.plot <- ggplot(data = CO2, aes(x = conc, y = uptake, colour = factor(Treatment))) + geom_point() + xlab("Concentration en CO2 (mL/L)") + ylab("Absorption de CO2 (umol/m^2 sec)") + ggtitle("Absorption de CO2 par une espèce de graminée") CO2.plot <- CO2.plot + geom_smooth(method = "loess",aes(fill=factor(Treatment))) print(CO2.plot) #### 2.2 Adding facets and groups #### # basic plot from CO2 CO2.plot <- ggplot(data = CO2, aes(x=conc, y=uptake, colour= Treatment)) + geom_point() print(CO2.plot) # Adding facets CO2.plot<-CO2.plot + facet_grid(.~Type) print(CO2.plot) # Adding line geoms print(CO2.plot + geom_line()) # Specifying groups CO2.plot <- CO2.plot + geom_line(aes(group = Plant)) print(CO2.plot) #-------------# # Challenge 3 # #-------------# data(msleep) data(OrchardSprays) # Solution data(OrchardSprays) box.plot <- ggplot(data = OrchardSprays, aes(x = treatment, y = decrease)) + geom_boxplot() print(box.plot) #------------# # Exercise 3 # #------------# #### 3. Saving plots #### pdf(“./plots/todays_plots.pdf”) print(basic.plot) print(plot.with.linear.smooth) print(categorical.plot) print(CO2.plot) graphics.off() #### 4. Fine tunning #### #### 4.1 Fine tunning colours #### CO2.plot+scale_colour_manual(values=c("nonchilled"="red","chilled"="blue")) # Bonus!!! RColorBrewer if(!require(RcolorBrewer)) {install.packages("RColorBrewer")} require(RColorBrewer) basic.plot + scale_color_brewer(palette="Dark2") # Bonus!!! Wes Anderson colour palette if(!require(devtools)) {install.packages("devtools")} require(devtools) devtools::install_github("wesanderson","karthik") require(wesanderson) basic.plot + scale_color_manual(values = wes.palette(3, "Darjeeling")) #### 4.2 Fine tunning scales #### CO2.plot + scale_y_continuous(name = "CO2 uptake rate", breaks = seq(5,50, by= 10), labels = seq(5,50, by= 10), trans="log10") #### 4.3 Fine tunning themes #### CO2.plot + theme_bw() # BONUS: ggtheme package if(!require(ggthemes)) {install.packages("ggthemes")} require(ggthemes) CO2.plot + theme_tufte() #-------------# # base R plot # #-------------# plot(iris) lm <- lm(Sepal.Length~Petal.Width, data = iris) x11() plot(lm) #### 5. Bonus! - Ecologists who may become vegan users #### install_github("ggvegan", "gavinsimpson") require(ggvegan) data(dune) data(dune.env) sol <- cca(dune ~ A1 + Management, data = dune.env) autoplot(sol) data(mite) data(mite.env) mite.hel = decostand(mite, "hel") rda <- rda(mite.hel ~ WatrCont + Shrub, mite.env) # Model with all explanatory variables x11() ggvegan.plot <- autoplot(rda) + theme_bw() normal.plot <- plot(rda) #------------------------------------------------------# # Reshaping and aggregating your data (reshape2 & plyr) #------------------------------------------------------# # Source materials: # Reshape / Reshape2 # http://seananderson.ca/2013/10/19/reshape.html # http://stat405.had.co.nz/lectures/11-adv-data-manip.pdf # http://stat405.had.co.nz/lectures/19-tables.pdf # http://had.co.nz/reshape/introduction.pdf # http://cran.r-project.org/web/packages/reshape2/reshape2.pdf # Plyr # http://seananderson.ca/courses/12-plyr/plyr_2012.pdf # Data # In addition to iris and CO2, we will use the built-in datasets "airquality" and "ChickWeight" # Explore the datasets ?airquality str(airquality) head(airquality) names(airquality) ?ChickWeight head(ChickWeight) str(ChickWeight) names(ChickWeight) # You can also use the following code to find other datasets available in R: data() #### 6. Reshape #### # Why reshape your data? #### 6.1 Wide vs. Long data #### # The built-in "iris" dataset is in "wide" format with variables used as column names head(iris) # "long" format data has a column for possible variable types # and a column for the values of those variables. # The format of your data depends on your specific needs, # but some functions and packages (such as ggplot2) work well with long format data. # Additionally, long form data can more easily be aggregated and converted # back into wide form data to provide summaries, or check the balance of sampling designs. # We can use the "reshape2" package by Hadley Wickham to: # 1."melt" our data (wide --> long) # 2."cast" our data (long --> wide) # Think of working with metal: if you melt metal, it drips and becomes long. # If you cast it into a mould, it becomes wide. #### 6.2 Melt #### ## MAKING YOUR DATA LONG ## ?melt # Let's try to "melt" the iris dataset using the defalut arguments iris.long <- melt(iris) # Note the message "Using Species as id variables" appears in the console. head(iris.long) tail(iris.long) # melt by default assumes that all columns with class "numeric" # are variables to be melted into a "variable" and a "value" column. # Often this is what you will want to do, but if you have nested factors, you may want to specify the id variables. # You need to tell melt which of your variables are id variables, and which are measured variables. # If you only supply one of id.vars and measure.vars, melt will assume the remainder of the # variables in the data set belong to the other. If you supply neither, melt will assume factor and # character variables are id variables, and all others are measured. # Let's try this with the C02 dataset: CO2.long <- melt(CO2) head(CO2) head(CO2.long) tail(CO2.long) # This time melt has used combinations of Plant, Type, and Treatment as id variables, # each of which has associated measurements for concentration (conc) and uptake # *********************************************************************************************** # # CHALLENGE: Which variables in the ChickWeight dataset will be used as default id variables? Why? # *********************************************************************************************** # # SOLUTION: ?ChickWeight str(ChickWeight) # "Chick" and "Diet" will be used as default id variables because they are factors, # whereas "Time" and "weight" are numeric vectors. Try melting ChickWeight and see for yourself! # *********************************************************************************************** # # BONUS!!! You can also specify id variables by name if needed AND you can # change the column names of your "variable" and "value" in one step: iris.long <- melt(iris, id.vars = c("Species"), variable.name = "Part_measured", value.name = "Measurement_cm") head(iris.long) #### 6.3 Cast #### ## MAKING YOUR DATA WIDE ## ?cast # With cast you can not only put your data back into "wide" format, # but can also subset, aggregate and get summary stats with one line of code! # The cast function has two versions depending on the class of object you want cast to output: # "dcast" produces a data frame # "acast" produces a vector, matrix, or array # Most of the time ecologists work with data frames, so we'll stick to that for now. # CASTING BASICS: # cast uses a formula to tell it which melted id variables and the measurement variables to cast: # ex: object <- dcast(data, id.variable.1 + id.variable.2 ~ measured.variable) # ************************************************************************************************ # # CHALLENGE: Try to melt and re-cast the airquality to return the same format as the original data. # ************************************************************************************************ # # SOLUTION: ?airquality names(airquality) air.long <- melt(airquality) head(air.long) # Notice that since all the variables are either numeric or integer, melt did not use any id variables # We must specify that we want "Month" and "Day" to be the id variables: air.long <- melt(airquality, id.vars = c("Month", "Day")) head(air.long) air.wide <- dcast(air.long, Month + Day ~ variable) head(air.wide) # Now air.wide is back in the same format as the original airquality (although the order of columns is changed) # ************************************************************************************************ # #### 6.4 Combining ggplot with reshape #### ##Example with the air quality dataset on using both wide and long data formats head(airquality) # The dataset is in wide format, where measured variables # (ozone, solar.r, wind and temp) are placed in their own columns. # Diagnostic plots using the wide format + ggplot2 # 1: Visualize each individual variable and the range it displays for each month in the timeseries fMonth<- factor(airquality$Month) #Convert the Month variable to a factor. ozone.box <- ggplot(airquality, aes(x=fMonth, y=Ozone)) + geom_boxplot() solar.box <- ggplot(airquality, aes(x=fMonth, y=Solar.R)) + geom_boxplot() temp.box <- ggplot(airquality, aes(x=fMonth, y=Temp)) + geom_boxplot() wind.box <- ggplot(airquality, aes(x=fMonth, y=Wind)) + geom_boxplot() # You can use grid.arrange() in the package gridExtra to put these plots into 1 figure. combo.box <- grid.arrange(ozone.box, solar.box, temp.box, wind.box, nrow=2) # nrow = number of rows you would like the plots displayed on. #This arranges the 4 separate plots into one panel for viewing. #Note that the scales on the individual y-axes are not the same. # 2: You can continue using the wide format of the airquality dataset to make # individual plots of each variable showing day measurements for each month. ozone.plot <- ggplot(airquality, aes(x=Day, y=Ozone)) + geom_point() + geom_smooth() ozone.plot <- ozone.plot + facet_wrap(~Month, nrow=2) solar.plot <- ggplot(airquality, aes(x=Day, y=Solar.R)) + geom_point() +geom_smooth() solar.plot <- solar.plot + facet_wrap(~Month, nrow=2) wind.plot <- ggplot(airquality, aes(x=Day, y=Wind)) + geom_point() +geom_smooth() wind.plot <- wind.plot + facet_wrap(~Month, nrow=2) temp.plot <- ggplot(airquality, aes(x=Day, y=Temp)) + geom_point() +geom_smooth() temp.plot <- temp.plot + facet_wrap(~Month, nrow=2) #You could even then combine these different faceted plots together: # (though it looks pretty ugly at the moment) combo.facets <- grid.arrange(ozone.plot, solar.plot, wind.plot, temp.plot, nrow=4) # BUT, what if I'd like to use facet_wrap() for the variables # as opposed to by month or put all variables on oneplot? # Change data from wide to long format (See back to Section 6.3). air.long <- melt(airquality, id.vars = c("Month", "Day")) head(air.long) air.wide <- dcast(air.long, Month + Day ~ variable) head(air.wide) # Use air.long fMonth.long <- factor(air.long$Month) weather <- ggplot(air.long, aes(x=fMonth.long, y=value)) + geom_boxplot() weather <- weather + facet_wrap(~variable, nrow=2) # Compare the "weather" plot with "combo.box" # This is the same data but working with it in wide versus long format has allowed us to make different looking plots. # The weather plot uses facet_wrap to put all the individual variables on the same scale. # This may be useful in many circumstances. However, using the facet_wrap means that # we don't see all the variation present in the wind variable. # In that case, you can modify the code to allow the scales to be determined per facet. weather <- weather + facet_wrap(~variable, nrow=2, scales="free") weather # We can also use the long format data (air.long) to create a plot with # all the variables included on a single plot: weather2 <- ggplot(air.long, aes(x=Day, y=value, colour=variable))+ geom_point()#this plot will put all the day measurements on one plot weather2 <- weather2 + facet_wrap(~Month, nrow=1) #add this part and again, the observations are split by month weather2 #### 6.5 Aggregating data with cast #### # With cast's fun.aggregate argument you can quickly get summary statistics of your data. # Note: Whenever there are fewer cells in the cast form than there were in the original data format, # an aggregation function is necessary. If fun.aggregate is not specified the default function is "length" # With cast you can also use some additional syntax to simplify your formulas: # "." corresponds to no variable, useful when creating formulas of the form . ∼ x or x ∼ . # "..." represents all variables not previously included in the casting formula. # EXAMPLE: Checking for a balanced study design # With cast, you can provide an aggregation function with the argument "fun.aggregate = xxx" # To get the number of samples per level of a factor you can use the function "length" # and use "..." to tell cast to use all measurement variables head(iris.long) dcast(iris.long, Species ~ ..., fun.aggregate = length) # As there are 50 observations per species per measured variable, we see that sample sizes are equal # If we wanted to check whether or not the number of observations is equal among species or among Part_measures, # we just modify the formula slightly so as to not include any response variable by using the "." dcast(iris.long, Species ~ ., fun.aggregate = length) dcast(iris.long, Part_measured ~ ., fun.aggregate = length) # EXAMPLE: Calculating central tendencies dcast(iris.long, Species ~ Part_measured, fun.aggregate = mean) dcast(iris.long, Species ~ Part_measured, fun.aggregate = median) # Try with other functions, but note that id the output of a function returns multiple vectors, (i.e. range, or summary), # you will have to play with cast a little more to get the right output (see http://had.co.nz/reshape/introduction.pdf section 4.4) # ****************************************************************************************** # # CHALLENGE: Determine which month had the highest mean temperature in the airquality dataset # ****************************************************************************************** # # SOLUTION: month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean) month_means # We can see it is Month 8, although if you have many levels of a factor, # you may want to write a line of code to pull out this for you (this is better as is less prone to human error!) # Ex: month_means$Month[ which(month_means$Temp == max(month_means$Temp)) ] # ********* # ********* # CHALLENGE: Determine which month had the highest mean Ozone level in the airquality dataset # ********* # SOLUTION: month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean) month_means # All Ozone means are NA!!! # This is because there are NAs in the original dataset. # With cast you can specify whether or not to remove NAs before aggregation: month_means <- dcast(air.long, Month ~ variable, fun.aggregate = mean, na.rm = TRUE) month_means month_means$Month[ which( month_means$Ozone == max(month_means$Ozone) )] # ********* # BONUS! "RECAST" # If you feel comfortable with the melt and cast functions, you can both melt and cast your data at once # with the recast function. ?recast # Getting means of environmental variables in the airquality dataset with recast: recast(airquality, Month ~ variable , fun.aggregate = mean, id.var=c("Month","Day"), na.rm=T) #### 7. Plyr #### ## MEGA DATA MANIPULATION ## # plyr provides a set of tools for manipulating data. # The basic format of plyr is "Split-Apply-Combine" which translates to: # "Split up some data, perform some function over each of the parts, and comine them back together" # plyr builds on the built-in apply functions by giving you control over the input and # output formats and keeping the syntax consistent across all variations. # The basic format is 2 letters followed by ply(). # The first letter refers to the format in and the second to the format out. # The 3 main letters are: # 1. d = data frame # 2. a = array (includes matrices) # 3. l = list # Since we are dealing with data frames in this workshop, we will stick to ddply, # but keep in mind the versatility that plyr allows for dealing with different data types. # Example: Creating a new data frame with mean and standard deviation of Temperature per month (airquality dataset) head(airquality) ddply(airquality, "Month", summarize, mean_Temp = mean(Temp, na.rm=T), sd_Temp = sd(Temp, na.rm=T)) # Augmenting our function to round our created variables to two decimal places: ?round ddply(airquality, "Month", summarize, mean_Temp = round(mean(Temp, na.rm=T), digits = 2), sd_Temp = round(sd(Temp, na.rm=T), digits = 2)) # If we want to retain our original data frame and simply add in our created variables, # we can use "transform" instead of "summarize" head(ddply(airquality, "Month", transform, mean_Temp = round(mean(Temp, na.rm=T), digits = 2), sd_Temp = round(sd(Temp, na.rm=T), digits = 2))) # Additionally, if we want to build upon our created variables to create additional columns, we can use "mutate" # Example: Creating a column with the coefficient of variation of temperature per month: head( ddply(airquality, "Month", mutate, mean_Temp = round(mean(Temp, na.rm=T), digits = 2), sd_Temp = round(sd(Temp, na.rm=T), digits = 2), cv_Temp = sd_Temp/mean_Temp)) # BONUS: If you are just interested in using "summarize" and not "transform", you can already refer to created variables using summarize: ddply(airquality, "Month", summarize, mean_Temp = round(mean(Temp, na.rm=T), digits = 2), sd_Temp = round(sd(Temp, na.rm=T), digits = 2), cv_Temp = sd_Temp/mean_Temp) # ******************************************************************************************************************************************** # # CHALLENGE: Create a data frame which diplays the total weight gain of individual chicks and the diet they were supplied (ChickWeight dataset) # ******************************************************************************************************************************************** # # SOLUTION: names(ChickWeight) # Use "summarize" with "Diet" and "Chick" as variables. # Create a new column which subtracts the maximum weight from the minimum weight per chick. # Note: Since Chick is nested within Diet, we are adding Diet simply to have it retainted in our output ddply(ChickWeight, c("Diet", "Chick"), summarize, total_growth = (max(weight) - min(weight))) # ******************************************************************************************************************************************** # # Check out R style guides to help format your scripts for easy reading: # https://google-styleguide.googlecode.com/svn/trunk/Rguide.xml