# Use '# symbol' to denote comments in scripts. The '# symbol' tells R to ignore anything remaining on a 
# given line of the script when running commands. 
# Since comments are ignored when running script, they allow you to leave yourself notes in your code 
# or tell collaborators what you did. A script with comments is a good step towards reproducible science 
# and annotating someone's script is a good way to learn. 


# It is recommended that you use comments to put a header at the beginning of your script
# with essential information: project name, author, date, version of R.

## QCBS R Workshop ##
## Workshop 2 - Loading and manipulating data
## Author: Quebec Center for Biodiversity Science
## Date: Fall 2018

if(!require(tidyr)){install.packages("tidyr")}
require(tidyr)

if(!require(dplyr)){install.packages("dplyr")}
require(dplyr)

if(!require(magrittr)){install.packages("magrittr")}
require(magrittr)


# Heading name
# You can use four # signs in a row to create section headings to help organize your script.
# For example:

#### Housekeeping ####

# Notice the small arrow next to the line number of the section heading we just created. 
# If you click on it, you hide this section of the script.

# It is good practice to have a command at the top of your script to clear R memory. This will help prevent errors such as using old data that has been left in your workspace. The command rm(list=ls()) will clear memory. 

rm(list=ls())  # Clears R workspace
?rm
?ls

# Remember: R is ready for commands when you see the chevron '>'.
# If the chevron isn't displayed, it means you typed an incomplete command and it is waiting for more input.  Press "Escape" to exit and get R ready for a new command.

A<-"Test"     # Put some data into workspace, to see how rm(list=ls()) removes it
A <- "Test"   # Note that you can use a space before or after <-
A = "Test"    # <- or = can be used equally
## Best practice is to use <- for assignment instead of the "=" sign
A
rm(list=ls())
A

# Remember that R is case sensitive. i.e. "A" is a different object than "a" 
a<-10  
A<-5
a
A

rm(list=ls())  # Clears R workspace again

#### LOADING DATA ####

getwd() # This commands shows the directory you are currently working in

# You can type the path of the directory in the brackets of the command setwd().
setwd('/Users/vincentfugere/Desktop/QCBS_R_Workshop2') # Mac example
setwd('C:/Users/Johanna/Documents/PhD/R_Workshop2')   # Windows Example
# **Note that this path will NOT work on your computer!

# Or you can use choose.dir() to get a pop up to navigate to the appropriate directory.
setwd(choose.dir()) # This may not work on a Mac.

CO2<-read.csv("CO2_good.csv") # Create an object called CO2 by loading data from a file called "CO2_good.csv"
CO2<-read.csv(file.choose()) # Alternatively, you can choose the file to load interactively using this command 

?read.csv # Use the question mark to pull up the help page for a command  

CO2<-read.csv("CO2_good.csv", header = TRUE) 
# Adding header = TRUE tells R that the first line of the spreadsheet contains column names and not data

# NOTE: if you have a french OS or CSV editor and read.csv does not work, try read.csv2 instead

#### LOOKING AT DATA ####

CO2 # Look at the whole dataframe
head(CO2) # Look at the first few rows
names(CO2) # Names of the columns in the dataframe
attributes(CO2) # Attributes of the dataframe
ncol(CO2) # Number of columns
nrow(CO2) # Number of rows
summary(CO2) # Summary statistics

str(CO2) # Structure of the dataframe 
# Useful to check mode of all columns, i.e. to check that all factors are factors and continuous data is integer or numeric

plot(CO2) # Plot of all variable combinations

# Is the response variable normally distributed? Try:
hist(CO2$uptake) # Remember that $ is used to extract a specific column from a dataframe

conc_mean<-mean(CO2$conc) # Calculate mean of the "conc" column of the "CO2" object. Save as "conc_mean"
conc_mean # Display object "conc_mean"
# The concentration mean is 435.

conc_sd<-sd(CO2$conc) # Calculate sd of "conc" column and save as "conc_sd"
conc_sd
# The concentration standard deviation is 295.92.

# Want to calculate mean or sd of all columns at once? Try apply()
?apply
apply(CO2[,4:5], MARGIN = 2, FUN = mean) # calculate mean of the two columns in the dataframe that contain continuous data

## Save your workspace ##

save.image(file="CO2_project_Data.RData") # Save workspace

rm(list=ls())  # Clears R workspace

load("CO2_project_Data.RData") # Reload everything that was in your workspace

head(CO2) # Looking good :)

write.csv(CO2,file="CO2_new.csv") # Save object CO2 to a file named CO2_new.csv

#### CHALLENGE: FIXING BROKEN dataframe ####

# Read a broken CO2 csv file into R and find the problems

CO2<-read.csv("CO2_broken.csv") # Overwrite CO2 object with broken CO2 data

## What are the problems?  Hint: There are 4.

## Useful functions

# Note: for these functions, you have to put the name of the data object in the parantheses (i.e. head(CO2)).
# Also remember that you can use "?" to look up help for a function (i.e. ?str).

?read.csv
head() 
str()  
class()
unique()
levels()
which()
droplevels()

#### ANSWERS BELOW-- No peaking!  ###
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#

## Broken CO2 data Problems ##

## Problem #1: the data appears to be lumped into one column.

# Re-import the data, but specify the separation among entries .
# The sep argument tells R what character separates the values on each line of the file.
# Here, "TAB" was used instead of ",".
CO2 <- read.csv("CO2_broken.csv",sep = "")
?read.csv

## Problem #2: the data does not start until the third line of the txt file, so you end up with notes on the file as the headings.

head(CO2) # The head() command allows you to see that the data has not been read in with the proper headings
# To fix this problem, you can tell R to skip the first two rows when reading in this file.
CO2<-read.csv("CO2_broken.csv",sep = "",skip=2)  # By adding the skip argument into the read.csv function, R knows to skip the first two rows
head(CO2) # You can now see that the CO2 object has the appropriate headings

## Problem #3: "conc" and "uptake" variables are considered factors instead of numbers, because there are comments/text in the numeric columns.

str(CO2) # The str() command shows you that both 'conc' and 'uptake' are labelled as factors
class(CO2$conc)
unique(CO2$conc) # By looking at the unique values in this column, you see that it contains "cannot_read_notes" 
unique(CO2$uptake) #This column contains both "cannot_read_notes" and "na"
?unique

CO2 <- read.csv("CO2_broken.csv",sep = "",skip = 2,na.strings = c("NA","na","cannot_read_notes")) 
# By identifying "cannot_read_notes" and "na" as NA data, R reads these columns properly.
# Remember that NA stands for not available.
head(CO2)
str(CO2) # You can see that the conc variable is now an integer and the uptake variable is now treated as numeric

## Problem #4: There are only two treatments (chilled and nonchilled) but there are spelling errors causing it to look like 4 different treatments.

str(CO2) # You can see that 4 levels are listed for Treatment
levels(CO2$Treatment)
unique(CO2$Treatment) # The 4 different treatments are "nonchilled", "nnchilled", "chilled", and "chiled"  

# You can use which() to find rows with the typo "nnchilled".
which(CO2$Treatment=="nnchilled") # Row number ten
# You can then correct the error using indexing:
CO2$Treatment[10]="nonchilled"
# Alternatively, doing it with a single command:
CO2$Treatment[which(CO2$Treatment=="nnchilled")]="nonchilled"
# Now doing the same for "chiled":
CO2$Treatment[which(CO2$Treatment=="chiled")]="chilled" 

# Have we fixed the problem?
str(CO2)  # Structure still identifies 4 levels of the factor
unique(CO2$Treatment) # But, unique says that only two are used
CO2<-droplevels(CO2) # This command drops the unused levels from all factors in the dataframe
str(CO2) # Fixed!


#------------------------------------------------------------------------------#
#------------------------------------------------#
#### 2. Using tidyr to manipulate data frames ####
#------------------------------------------------#

# Source materials:

# tidyr
#https://blog.rstudio.org/2014/07/22/introducing-tidyr/

#### 2.1 Why "tidy" your data? ####

# Load and explore the "airquality" datasets
?airquality
str(airquality)
head(airquality)
names(airquality)

#### 2.2 Wide vs long data ####

# We can use the "tidyr" package by Hadley Wickham to:
# 1."gather" our data (wide --> long)
# 2."spread" our data (long --> wide)

# Example: Let's pretend you send out your field assistant to measure the
# diameter at breast height (DBH) and height of three tree species for you.
# They return with the following messy (wide) dataset:
messy <- data.frame(Species = c("Oak", "Elm", "Ash"),
                    DBH = c(12, 20, 13),
                    Height = c(56, 85, 55))
messy

#### 2.3 Gather: Making your data long ####
?gather

# "gather()" the DBH and Height columns
messy.long <- gather(messy, Measurement, cm, DBH, Height)
messy.long

# Let's try this with the C02 dataset. Here we might want to collapse the 
# last two quantitative variables "conc" and "uptake":
CO2.long <- gather(CO2, response, value, conc, uptake)  
head(CO2) 
head(CO2.long) 
tail(CO2.long)

#### 2.4 Spread: Making your data wide ####
?spread 

# spread uses the same syntax as gather (they are complements)
messy.wide <- spread(messy.long, Measurement, cm)
messy.wide

#------------------------------------------------------------------------------#
#-------------------------#
#### tidyr Challenge 4 ####
#-------------------------#

# Using the ''airquality'' dataset, ''gather()'' all the columns
# (except Month and Day) into rows. Then ''spread()'' the resulting
# dataset to return the same data format as the original data.

# SOLUTION: 
?airquality
names(airquality)
air.long <- gather(airquality, variable, value, -Month, -Day)
head(air.long)
air.wide <- spread(air.long , variable, value)
head(air.wide)

# Now air.wide is back in the same format as the original airquality
# (although the order of columns is changed)
#------------------------------------------------------------------------------#


# some times you might have really messy data which has two varaiables in
# one column. Thankfully the separate function can (wait for it) 
# separate the two variables into two columns 

#### 2.5 separate(): Separate two (or more) variables in a single column ####

# lets say you have this really messy data set 
set.seed(8)
really.messy <- data.frame(id = 1:4,
                           trt = sample(rep(c('control', 'farm'), each = 2)),
                           zooplankton.T1 = runif(4),
                           fish.T1 = runif(4),
                           zooplankton.T2 = runif(4),
                           fish.T2 = runif(4))

# first we want to convert this wide dataset to long 
really.messy.long <- gather(really.messy, taxa, count, -id, -trt)

# then we want to split those two sampling time (T1 & T2). The syntax we use here is to tell R seperate(data, what column, into what, by what)
# the tricky part here is telling R where to separate the character string in your column entry 
# using a regular expression to describe the character that separates them
# here the string should be separated by the period (.)
really.messy.long.sep <- separate(really.messy.long, taxa, into = c("species", "time"), sep = "\\.") 

#### 2.6 Combining ggplot with tidyr ####

##Example with the air quality dataset on using both wide and long data formats 
head(airquality)
# The dataset is in wide format, where measured variables
# (ozone, solar.r, wind and temp) are placed in their own columns.


#------------------------------------------------------------------------------#
#---------------------------------------#
#### 3. Data manipulation with dplyr ####
#---------------------------------------#

## MEGA DATA MANIPULATION ##

#### 3.1 Intro - the dplyr mission ####

#### 3.2 Basic dplyr functions ####

# Select a subset of columns with select()
ozone <- select(airquality, Ozone, Month, Day)
head(ozone)

# Select a subset of rows with filter()
august <- filter(airquality, Month == 8, Temp >= 90)
head(august)

# Sort columns with arrange()
air_mess <- sample_frac(airquality, 1)
head(air_mess)

air_chron <- arrange(air_mess, Month, Day)
head(air_chron)

# Create and populate columns with mutate()
airquality_C <- mutate(airquality, Temp_C = (Temp-32)*(5/9))
head(airquality_C)


#### 3.3 dplyr and magrittr, a match made in heaven ####

# two steps wrapped
june_C <- mutate(filter(airquality, Month == 6), Temp_C = (Temp-32)*(5/9))

# steps linked using magrittr
june_C <- airquality %>%
  filter(Month == 6) %>%
  mutate(Temp_C = (Temp-32)*(5/9)) 

#### 3.4 dplyr - Summaries and grouped operations ####
month_sum <- airquality %>% 
  group_by(Month) %>% 
  summarise(mean_temp = mean(Temp),
            sd_temp = sd(Temp)) 
month_sum

#------------------------------------------------------------------------------#
#### dplyr Challenge # 5 ####
# Using the ChickWeight dataset, create a summary table which displays the
# difference in weight between the maximum and minimum weight of each chick
# in the study. Employ dplyr verbs and the %>% operator.
weight_gain <- ChickWeight %>% 
  group_by(Chick) %>% 
  summarise(weight_gain = max(weight) - min(weight))
weight_gain

#------------------------------------------------------------------------------#
#### dplyr Ninja Challenge # 6 ####
# Using the ChickWeight dataset, create a summary table which displays, for
# each diet, the average individual difference in weight between the end and
# the beginning of the study. Employ dplyr verbs and the %>% operator.
# (Hint: first() and last() may be useful here.)
diet_summ <- ChickWeight %>% 
  group_by(Diet, Chick) %>% 
  summarise(weight_gain = max(weight) - min(weight)) %>%
  summarise(mean_gain = mean(weight_gain))
diet_summ

#------------------------------------------------------------------------------#