## QCBS R Workshop Series
##
## ggplot2 // tidyr // dplyr
##
## Author: Quebec Center for Biodiversity Science
## Materials Generated & Amalgamated by:
##   Xavier Giroux-Bougard, Monica Granados,
##   Maxwell Farrell, Etienne Low-Decarie
## Last updated: November 2nd 2016
## Built under R version 3.1.3

#### 0. Housekeeping ####

# Clean up your workspace. This removes every object from the global
# environment; it does not change the working directory or detach packages.
rm(list = ls())

# Install (only when missing) and load the required packages.
# requireNamespace() checks availability without attaching anything, and
# library() stops with an error if a package still cannot be loaded, whereas
# require() would only return FALSE and let the script fail later on.
# The packages are attached in the order listed.
required.packages <- c("ggplot2", "tidyr", "dplyr", "magrittr",
                       "gridExtra", "viridis", "devtools")
for (pkg in required.packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

#------------------------------------------------------------#
#### 1. Plotting in R using grammar of graphics (ggplot2) ####
#------------------------------------------------------------#

#### 1.1 Intro to ggplot2 ####

#### 1.2 Simple plots using qplot() ####

# Explore the qplot help file
?qplot

# Explore the Iris dataset
data(iris)
?iris
head(iris)
str(iris)
names(iris)

# Most basic scatter plot
qplot(data = iris,
      x = Sepal.Length,
      y = Sepal.Width)

# Most basic scatter plot (categorical data)
qplot(data = iris,
      x = Species,
      y = Sepal.Width)

# Basic scatter plot with labels/title
# (the iris measurements are recorded in centimetres, see ?iris)
qplot(data = iris,
      x = Sepal.Length,
      xlab = "Sepal Length (cm)",
      y = Sepal.Width,
      ylab = "Sepal Width (cm)",
      main = "Sepal dimensions")

#------------------------------------------------------------------------------#
#-----------------------------#
#### ggplot2 - Challenge 1 ####
#-----------------------------#
# Using the qplot() function, build a basic scatter plot with a title
# and axis labels from one of the CO2 or BOD data sets in R.
# You can load these and explore their contents as follows:
?CO2
data(CO2)
?BOD
data(BOD)

# SOLUTION:
qplot(data = CO2,
      x = conc,
      xlab = "Concentration de CO2 (mL/L)",
      y = uptake,
      ylab = "Absorption de CO2 (umol/m^2 sec)",
      main = "Absorption de CO2 chez une espèce de graminée")
#------------------------------------------------------------------------------#

#### 1.3 The Grammar of Graphics ####

#### 1.4 Advanced plots using ggplot() ####

# using qplot()
# (the iris measurements are recorded in centimetres, see ?iris)
qplot(data = iris,
      x = Sepal.Length,
      xlab = "Sepal Length (cm)",
      y = Sepal.Width,
      ylab = "Sepal Width (cm)",
      main = "Sepal dimensions")

# equivalent code using ggplot()
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  xlab("Sepal Length (cm)") +
  ylab("Sepal Width (cm)") +
  ggtitle("Sepal dimensions")

# Assign ggplot to object
basic.plot <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  xlab("Sepal Length (cm)") +
  ylab("Sepal Width (cm)") +
  ggtitle("Sepal dimensions")

#### 1.5 Adding colours and shapes ####
basic.plot <- basic.plot +
  aes(colour = Species, shape = Species)
basic.plot

#### 1.6 Adding geometric objects ####
linear.smooth.plot <- basic.plot +
  geom_smooth(method = "lm", se = FALSE)
linear.smooth.plot

#------------------------------------------------------------------------------#
#------------------------------#
#### ggplot2 - Challenge 2 ####
#------------------------------#
# Produce a colourful plot with linear regression (or other smoother)
# from built in data such as the CO2 dataset or the msleep dataset:

# Explore the CO2 dataset
data(CO2)
?CO2
head(CO2)
str(CO2)
names(CO2)

# Solution using a loess smoother
data(CO2)
CO2.plot <- ggplot(data = CO2,
                   aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  xlab("CO2 Concentration (mL/L)") +
  ylab("CO2 Uptake (umol/m^2 sec)") +
  ggtitle("CO2 uptake in grass plants") +
  geom_smooth(method = "loess")
CO2.plot

# Could also create a smoothed curve and colour the SE intervals by factor
CO2.plot <- ggplot(data = CO2,
                   aes(x = conc, y = uptake, colour = factor(Treatment))) +
  geom_point() +
  xlab("CO2 Concentration (mL/L)") +
  ylab("CO2 Uptake (umol/m^2 sec)") +
  ggtitle("CO2 uptake in grass plants") +
  geom_smooth(method = "loess", aes(fill = factor(Treatment)))
CO2.plot
#------------------------------------------------------------------------------#

#### 1.7 Adding multiple facets ####

# basic plot from CO2
data(CO2)
CO2.plot <- ggplot(data = CO2,
                   aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  xlab("CO2 Concentration (mL/L)") +
  ylab("CO2 Uptake (umol/m^2 sec)") +
  ggtitle("CO2 uptake in grass plants")
CO2.plot

# Adding facets
CO2.plot <- CO2.plot +
  facet_grid(. ~ Type)
CO2.plot

#### 1.8 Adding groups ####

# Adding line geoms
CO2.plot +
  geom_line()

# Specifying groups
CO2.plot <- CO2.plot +
  geom_line(aes(group = Plant))
CO2.plot

#------------------------------------------------------------------------------#
#-----------------------------#
#### ggplot2 - Challenge 3 ####
#-----------------------------#
# Explore a new geom and other plot elements with your own data or built in data
?msleep
data(msleep)
?OrchardSprays
data(OrchardSprays)

# SOLUTION
data(OrchardSprays)
box.plot <- ggplot(data = OrchardSprays,
                   aes(x = treatment, y = decrease)) +
  geom_boxplot()
box.plot
#------------------------------------------------------------------------------#

#### 1.9 Saving plots ####

# The categorical scatter plot was only drawn with qplot() so far and never
# stored; build it as an object so that it can be written to the PDF too.
categorical.plot <- ggplot(data = iris,
                           aes(x = Species, y = Sepal.Width)) +
  geom_point()

# pdf() fails if the target directory is missing, so create it first
# (showWarnings = FALSE keeps this quiet when it already exists).
dir.create("./plots", showWarnings = FALSE)
pdf("./plots/todays_plots.pdf")
print(basic.plot)
print(linear.smooth.plot)
print(categorical.plot)
print(CO2.plot)
graphics.off()

# with the ggsave() function
ggsave("CO2graph.pdf", CO2.plot, height = 8.5, width = 11, units = "in")

#### 1.10 Fine tuning - colours ####

# manually
CO2.plot +
  scale_colour_manual(values = c("nonchilled" = "red", "chilled" = "blue"))

# with hex colours
CO2.plot +
  scale_colour_manual(values = c("#FF0000", "#1111e5"))

# using the viridis palette
viridis(2, alpha = 1, begin = 0, end = 1) # outputs the hex codes for you to use in
# scale_colour_manual()
CO2.plot +
  scale_colour_manual(values = viridis(2, option = "D"))

# Bonus!!! RColorBrewer
# Package names are case-sensitive: it is "RColorBrewer". The install is
# only attempted when the package is missing.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)
basic.plot +
  scale_color_brewer(palette = "Dark2")

# Bonus!!! Wes Anderson colour palette
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
# install_github() expects a single "user/repository" string; only install
# when the package is not available yet.
if (!requireNamespace("wesanderson", quietly = TRUE)) {
  devtools::install_github("karthik/wesanderson")
}
library(wesanderson)
# Palette names carry a numeric suffix (e.g. "GrandBudapest1")
basic.plot +
  scale_color_manual(values = wes_palette("GrandBudapest1", 3))

#### 1.11 Fine tuning axes and scales ####
CO2.plot +
  scale_y_continuous(name = "CO2 uptake rate",
                     breaks = seq(5, 50, by = 10),
                     labels = seq(5, 50, by = 10),
                     trans = "log10")

#### 1.12 Fine tuning themes ####

# black and white ggplot2 theme
CO2.plot +
  theme_bw()

# building your own theme
mytheme <- theme_bw() +
  theme(plot.title = element_text(colour = "red")) +
  theme(legend.position = c(0.9, 0.9))
CO2.plot +
  mytheme

# BONUS: ggtheme package
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
library(ggthemes)
CO2.plot +
  theme_tufte()

# base R plots
plot(iris)
lm <- lm(Sepal.Length ~ Petal.Width, data = iris)
# dev.new() opens a new window with the platform's default graphics device
dev.new()
plot(lm)

# Bonus! - Ecologists who may become vegan users
# devtools::install_github("gavinsimpson/ggvegan")
library(ggvegan)
data(dune)
data(dune.env)
sol <- cca(dune ~ A1 + Management, data = dune.env)
autoplot(sol)

data(mite)
data(mite.env)
mite.hel <- decostand(mite, "hel")
# Model with WatrCont and Shrub as explanatory variables
rda <- rda(mite.hel ~ WatrCont + Shrub, mite.env)
dev.new()
ggvegan.plot <- autoplot(rda) +
  theme_bw()
normal.plot <- plot(rda)
#------------------------------------------------------------------------------#

#------------------------------------------------#
#### 2. Using tidyr to manipulate data frames ####
#------------------------------------------------#

# Source materials:
# tidyr
# https://blog.rstudio.org/2014/07/22/introducing-tidyr/

#### 2.1 Why "tidy" your data? ####
# Load and explore the "airquality" dataset
?airquality
str(airquality)
head(airquality)
names(airquality)

#### 2.2 Wide vs long data ####

# The "tidyr" package by Hadley Wickham lets us:
#   1. "gather" our data (wide --> long)
#   2. "spread" our data (long --> wide)

# Example: imagine your field assistant measured the diameter at breast
# height (DBH) and the height of three tree species for you, and came back
# with the following messy (wide) dataset:
messy <- data.frame(
  Species = c("Oak", "Elm", "Ash"),
  DBH     = c(12, 20, 13),
  Height  = c(56, 85, 55)
)
messy

#### 2.3 Gather: Making your data long ####
?gather

# "gather()" the DBH and Height columns into key/value pairs
messy.long <- gather(data = messy,
                     key = Measurement,
                     value = cm,
                     DBH, Height)
messy.long

# Let's try this with the CO2 dataset. Here we might want to collapse the
# last two quantitative variables "conc" and "uptake":
CO2.long <- gather(data = CO2,
                   key = response,
                   value = value,
                   conc, uptake)
head(CO2)
head(CO2.long)
tail(CO2.long)

#### 2.4 Spread: Making your data wide ####
?spread

# spread() takes the same key/value arguments as gather()
# (the two functions are complements)
messy.wide <- spread(data = messy.long,
                     key = Measurement,
                     value = cm)
messy.wide

#------------------------------------------------------------------------------#
#-------------------------#
#### tidyr Challenge 4 ####
#-------------------------#
# Using the "airquality" dataset, gather() all the columns
# (except Month and Day) into rows. Then spread() the resulting
# dataset to return the same data format as the original data.

# SOLUTION:
?airquality
names(airquality)

air.long <- gather(data = airquality,
                   key = variable,
                   value = value,
                   -Month, -Day)
head(air.long)

air.wide <- spread(data = air.long,
                   key = variable,
                   value = value)
head(air.wide)
# Now air.wide is back in the same format as the original airquality
# (although the order of columns is changed)
#------------------------------------------------------------------------------#

# Sometimes you might have really messy data which has two variables in
# one column.
# Thankfully the separate function can (wait for it)
# separate the two variables into two columns

#### 2.5 separate(): Separate two (or more) variables in a single column ####

# Let's say you have this really messy data set
# (the random draws happen in the order the columns are listed)
set.seed(8)
really.messy <- data.frame(
  id             = 1:4,
  trt            = sample(rep(c("control", "farm"), each = 2)),
  zooplankton.T1 = runif(4),
  fish.T1        = runif(4),
  zooplankton.T2 = runif(4),
  fish.T2        = runif(4)
)

# First we want to convert this wide dataset to long
really.messy.long <- gather(data = really.messy,
                            key = taxa,
                            value = count,
                            -id, -trt)

# Then we want to split out the two sampling times (T1 & T2). The syntax is
# separate(data, what column, into what, by what).
# The tricky part is telling R where to split the character string held in
# the column: a regular expression describes the separating character,
# which here is the period (.)
really.messy.long.sep <- separate(data = really.messy.long,
                                  col = taxa,
                                  into = c("species", "time"),
                                  sep = "\\.")

#### 2.6 Combining ggplot with tidyr ####

## Example with the air quality dataset using both wide and long data formats
head(airquality)
# The dataset is in wide format, where the measured variables
# (ozone, solar.r, wind and temp) are placed in their own columns.

# Diagnostic plots using the wide format + ggplot2

# 1: Visualize each individual variable and the range it displays for each
#    month in the time series
fMonth <- factor(airquality$Month) # Month converted to a factor

ozone.box <- ggplot(data = airquality, mapping = aes(fMonth, Ozone)) +
  geom_boxplot()
solar.box <- ggplot(data = airquality, mapping = aes(fMonth, Solar.R)) +
  geom_boxplot()
temp.box <- ggplot(data = airquality, mapping = aes(fMonth, Temp)) +
  geom_boxplot()
wind.box <- ggplot(data = airquality, mapping = aes(fMonth, Wind)) +
  geom_boxplot()

# You can use grid.arrange() in the package gridExtra to put these plots into 1 figure.
# nrow = number of rows the plots should be laid out on.
combo.box <- grid.arrange(ozone.box, solar.box, temp.box, wind.box,
                          nrow = 2)
# This arranges the 4 separate plots into one panel for viewing.
# Note that the scales on the individual y-axes are not the same.

# 2: Still using the wide format of the airquality dataset, build one plot
#    per variable showing the daily measurements, faceted by month.
ozone.plot <- ggplot(data = airquality, mapping = aes(Day, Ozone)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ Month, nrow = 2)

solar.plot <- ggplot(data = airquality, mapping = aes(Day, Solar.R)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ Month, nrow = 2)

wind.plot <- ggplot(data = airquality, mapping = aes(Day, Wind)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ Month, nrow = 2)

temp.plot <- ggplot(data = airquality, mapping = aes(Day, Temp)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ Month, nrow = 2)

# These faceted plots can themselves be combined into one figure
# (though it looks pretty ugly at the moment)
combo.facets <- grid.arrange(ozone.plot, solar.plot, wind.plot, temp.plot,
                             nrow = 4)

# BUT, what if I'd like to use facet_wrap() for the variables
# as opposed to by month, or put all variables on one plot?
air.long <- gather(data = airquality,
                   key = variable,
                   value = value,
                   -Month, -Day)
head(air.long)

air.wide <- spread(data = air.long,
                   key = variable,
                   value = value)
head(air.wide)

# Use air.long
fMonth.long <- factor(air.long$Month)
weather <- ggplot(data = air.long, mapping = aes(fMonth.long, value)) +
  geom_boxplot() +
  facet_wrap(~ variable, nrow = 2)

# Compare the "weather" plot with "combo.box".
# This is the same data, but working with it in wide versus long format has
# allowed us to make different looking plots.
# The weather plot uses facet_wrap() to put all the individual variables on
# the same scale. This may be useful in many circumstances. However, using
# facet_wrap() this way means that
# we don't see all the variation present in the wind variable.
# In that case, you can modify the code to allow the scales to be determined
# per facet.
weather <- weather +
  facet_wrap(~ variable, nrow = 2, scales = "free")
weather

# The long format data (air.long) can also be used to draw
# all the variables together on a single plot:
weather2 <- ggplot(data = air.long,
                   mapping = aes(Day, value, colour = variable)) +
  # this layer puts all the day measurements on one plot
  geom_point() +
  # add this part and, again, the observations are split by month
  facet_wrap(~ Month, nrow = 1)
weather2
#------------------------------------------------------------------------------#

#---------------------------------------#
#### 3. Data manipulation with dplyr ####
#---------------------------------------#

## MEGA DATA MANIPULATION ##

#### 3.1 Intro - the dplyr mission ####

#### 3.2 Basic dplyr functions ####

# Select a subset of columns with select()
ozone <- select(.data = airquality, Ozone, Month, Day)
head(ozone)

# Select a subset of rows with filter()
august <- filter(.data = airquality, Month == 8, Temp >= 90)
head(august)

# Sort rows with arrange()
# (sample_frac() with size 1 first shuffles all the rows)
air_mess <- sample_frac(tbl = airquality, size = 1)
head(air_mess)
air_chron <- arrange(.data = air_mess, Month, Day)
head(air_chron)

# Create and populate columns with mutate()
airquality_C <- mutate(.data = airquality,
                       Temp_C = (Temp - 32) * (5 / 9))
head(airquality_C)

#### 3.3 dplyr and magrittr, a match made in heaven ####

# two steps wrapped
june_C <- mutate(
  filter(airquality, Month == 6),
  Temp_C = (Temp - 32) * (5 / 9)
)

# steps linked using magrittr
june_C <- airquality %>%
  filter(Month == 6) %>%
  mutate(Temp_C = (Temp - 32) * (5 / 9))

#### 3.4 dplyr - Summaries and grouped operations ####
month_sum <- airquality %>%
  group_by(Month) %>%
  summarise(mean_temp = mean(Temp),
            sd_temp = sd(Temp))
month_sum

#------------------------------------------------------------------------------#
#### dplyr Challenge # 5 ####
# Using the ChickWeight dataset, create a summary table which displays the
# difference in weight between the maximum and minimum weight
# of each chick
# in the study. Employ dplyr verbs and the %>% operator.
weight_gain <- ChickWeight %>%
  group_by(Chick) %>%
  summarise(weight_gain = max(weight) - min(weight))
weight_gain
#------------------------------------------------------------------------------#

#### dplyr Ninja Challenge # 6 ####
# Using the ChickWeight dataset, create a summary table which displays, for
# each diet, the average individual difference in weight between the end and
# the beginning of the study. Employ dplyr verbs and the %>% operator.
# (Hint: first() and last() may be useful here.)

# The task asks for end-of-study minus start-of-study weight, which is not
# the same as max(weight) - min(weight) for a chick that lost weight.
# first()/last() with order_by = Time pick the earliest and latest
# measurement of each chick regardless of the row order.
# The first summarise() collapses each (Diet, Chick) group to one row and
# drops the Chick grouping level, so the second summarise() averages the
# individual gains within each Diet.
diet_summ <- ChickWeight %>%
  group_by(Diet, Chick) %>%
  summarise(weight_gain = last(weight, order_by = Time) -
              first(weight, order_by = Time)) %>%
  summarise(mean_gain = mean(weight_gain))
diet_summ
#------------------------------------------------------------------------------#