# Name: Amanda Winegardner, Xavier Giroux-Bougard, Bérenger Bourgeois, Emmanuelle Chrétien and Monica Granados
# Date: February 2016
# Description: Ordination
# Dataset: File names are "DoubsSpe.csv", "DoubsEnv.csv"
# Notes: Material in R script obtained from
#        Borcard D., Gillet F. et Legendre P., 2011. Numerical Ecology with R. Springer.
#***************************************************************************************#


# 0. Loading the data ####

rm(list=ls())
install.packages("vegan")
install.packages("gclus")
install.packages("ape")
library(vegan)
library(gclus)
library(ape)
source(file.choose()) # fichier coldiss.R

# Species community data frame (fish abundance): "DoubsSpe.csv"
spe <- read.csv(file.choose(), row.names=1)
spe <- spe[-8,] # site number 8 contains no species and must be removed

# Environmental data frame: "DoubsEnv.csv"
env <- read.csv(file.choose(), row.names=1)
env <- env[-8,]

#***************************************************************************************#

# 1.Explore the data ####

## 1.1 Species data ####
names(spe)
dim(spe)
str(spe)
head(spe)
summary(spe)

### Species distibution, all species confounded
(ab <- table(unlist(spe)))
barplot(ab, las=1, xlab="Abundance class", ylab="Frequency", col=grey(5:0/5))

### Number of absences
sum(spe==0)

### Proportion of zeros in the community data set
sum(spe==0)/(nrow(spe)*ncol(spe))

### Species occurences
spe.pres <- colSums(spe>0) # compute the number of sites where each species is present
hist(spe.pres, main="Species occurence", las=1, xlab="Frequency of occurences", ylab="Number of species", breaks=seq(0,30, by=5), col="grey")

### Sites richness
site.pres <- rowSums(spe>0) # compute the number of species at ecah site
hist(site.pres, main="Species richness", las=1, xlab="Frequency of sites", ylab="Number of species", breaks=seq(0,30, by=5), col="grey")

### Compute diversity indices
?diversity
(H <- diversity(spe, index="shannon")) # Shannon diversity index
(N <- diversity(spe, index="simpson")) # Simpson diversity index

## 1.2 Environmental data ####
names(env)
dim(env)
str(env)
head(env)
summary(env)
pairs(env, main="Bivariate Plots of the Environmental Data" )

# Standardization of the 11 environmental data
env.z <- decostand(env, method="standardize")
apply(env.z, 2, mean) # the data are now centered (means~0)
apply(env.z, 2, sd)   # the data are now scaled (standard deviations=1)

# Note: explanatory variables expressed in different units
#       must obligatory be standardized before computing distances measures and ordination analysis.

#***************************************************************************************#

# 2.Association measures ####

## 2.1. Distance measures ####

### Quantitative species data
?vegdist # this function compute dissimilarity indices useful for community decomposition data (see method options)
(spe.db<-vegdist(spe, method="bray")) # Bray-Curtis dissimilarity
(spe.dj<-vegdist(spe, method="jac")) # Jaccard dissimilarity
(spe.dg<-vegdist(spe, method="gower")) # Gower dissimilarity

### Presence-Absence species data
(spe.db.pa<-vegdist(spe, method="bray", binary=TRUE)) #Bray-Curtis dissimilarity
(spe.dj.pa<-vegdist(spe, method="jac", binary=TRUE)) # Jaccard dissimilarity
(spe.dg.pa<-vegdist(spe, method="gower", binary=TRUE)) # Gower dissimilarity

### Graphical display of association matrices: heat maps
windows() #use x11() or quartz() on Mac
coldiss(spe.db, byrank=FALSE, diag=TRUE) # Heat map of Bray-Curtis dissimilarity
windows()
coldiss(spe.dj, byrank=FALSE, diag=TRUE) # Heat map of Jaccard dissimilarity
windows()
coldiss(spe.dg, byrank=FALSE, diag=TRUE) # Heat map of Gower dissimilarity

### Quantitative environmental data
#### Association between environmental data (Q mode)
?dist # this function also compute dissimilarity matrix
env.de <- dist(env.z, method = "euclidean") # euclidean distance matrix of the standardized environmental variables
windows()
coldiss(env.de, diag=TRUE)

### Dependance between environmental data (R mode)
(env.pearson<-cor(env)) # Pearson r linear correlation
round(env.pearson, 2)
(env.ken<-cor(env, method="kendall")) # Kendall tau rank correlation
round(env.ken, 2)

### Mixed types of environmental variables
#### Association between mixed types of environmental data (Q mode)
##### Create a fictious data frame
var.g1 <- rnorm(30, 0, 1)
var.g2 <- runif(30, 0, 5)
var.g3 <- gl(3, 10)
var.g4 <- gl(2, 5, 30)
(dat2 <- data.frame(var.g1, var.g2, var.g3, var.g4))
str(dat2)
summary(dat2)
##### Compute the Gower dissimilarity matrix
?daisy
(dat2.dg <- daisy(dat2, metric="gower"))
coldiss(dat2.dg)

# Challenge 1 Advanced ####

# Calculate by hand the Bray-Curtis and the Gower dissimilarity of species abundance CHA, TRU and VAI for sites 1, 2 and 3
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
# Bray-Curtis dissimilarity : d[jk] = (sum abs(x[ij]-x[ik]))/(sum (x[ij]+x[ik]))
(spe.challenge<-spe[1:3,1:3])
# Calculate the total species abundance by site
(Abund.s1<-sum(spe.challenge[1,]))
(Abund.s2<-sum(spe.challenge[2,]))
(Abund.s3<-sum(spe.challenge[3,]))
# Calculate the species abundance differences between pairs of sites for each species
Spec.s1s2<-0
Spec.s1s3<-0
Spec.s2s3<-0
for (i in 1:3) {
  Spec.s1s2<-Spec.s1s2+abs(sum(spe.challenge[1,i]-spe.challenge[2,i]))
  Spec.s1s3<-Spec.s1s3+abs(sum(spe.challenge[1,i]-spe.challenge[3,i]))
  Spec.s2s3<-Spec.s2s3+abs(sum(spe.challenge[2,i]-spe.challenge[3,i])) }
# Calculate the Bray-Curtis dissimilarity
(db.s1s2<-Spec.s1s2/(Abund.s1+Abund.s2))
(db.s1s3<-Spec.s1s3/(Abund.s1+Abund.s3))
(db.s2s3<-Spec.s2s3/(Abund.s2+Abund.s3))
# Compare your results
(spe.db.challenge<-vegdist(spe.challenge, method="bray"))

# Gower dissimilarity :    d[jk] = (1/M) sum(abs(x[ij]-x[ik])/(max(x[i])-min(x[i])))
# Calculate the number of columns in your dataset
M<-ncol(spe.challenge)
# Calculate the species abundance differences between pairs of sites for each species
Spe1.s1s2<-abs(spe.challenge[1,1]-spe.challenge[2,1])
Spe2.s1s2<-abs(spe.challenge[1,2]-spe.challenge[2,2])
Spe3.s1s2<-abs(spe.challenge[1,3]-spe.challenge[2,3])
Spe1.s1s3<-abs(spe.challenge[1,1]-spe.challenge[3,1])
Spe2.s1s3<-abs(spe.challenge[1,2]-spe.challenge[3,2])
Spe3.s1s3<-abs(spe.challenge[1,3]-spe.challenge[3,3])
Spe1.s2s3<-abs(spe.challenge[2,1]-spe.challenge[3,1])
Spe2.s2s3<-abs(spe.challenge[2,2]-spe.challenge[3,2])
Spe3.s2s3<-abs(spe.challenge[2,3]-spe.challenge[3,3])
# Calculate the range of each species abundance between sites
Range.spe1<-max(spe.challenge[,1]) - min (spe.challenge[,1])
Range.spe2<-max(spe.challenge[,2]) - min (spe.challenge[,2])
Range.spe3<-max(spe.challenge[,3]) - min (spe.challenge[,3])
# Calculate the Gower dissimilarity
(dg.s1s2<-(1/M)*((Spe2.s1s2/Range.spe2)+(Spe3.s1s2/Range.spe3)))
(dg.s1s3<-(1/M)*((Spe2.s1s3/Range.spe2)+(Spe3.s1s3/Range.spe3)))
(dg.s2s3<-(1/M)*((Spe2.s2s3/Range.spe2)+(Spe3.s2s3/Range.spe3)))
# Compare your results
(spe.db.challenge<-vegdist(spe.challenge, method="gower"))


## 2.2. Transformations for community composition data ####

?decostand # this function provide some standardiztion methods for community domposition data (see method options)
# Transform abundances to absence-presence (1-0)
(spe.pa<-decostand(spe, method="pa"))

# Hellinger transformation
(spe.hel<-decostand(spe, method="hellinger"))

# Chi.square transformation
(spe.chi<-decostand(spe, method="chi.square"))


# Challenge 2 Advanced ####

# Calculate by hand the Hellinger and the Chi-square transformed abundances data
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
# Hellinger tranformation
# First calculate the total species abundances by site
(site.totals<-apply(spe, 1, sum))
# Scale species abundances by dividing them by site totals
(scale.spe<-spe/site.totals)
# Calculate de square root of scaled species abundances
(sqrt.scale.spe<-sqrt(scale.spe))
# Compare the results
sqrt.scale.spe
spe.hel
sqrt.scale.spe-spe.hel # or: sqrt.scale.spe/spe.hel

# Chi-squre transformation
# First calculate the total species abundances by site
(site.totals<-apply(spe, 1, sum))
# Then calculate the square root of total species abundances
(sqrt.spe.totals<-sqrt(apply(spe, 2, sum)))
# Scale species abundances by dividing them by the site totals and the species totals
scale.spe2<-spe
for (i in 1:nrow(spe)) {
  for (j in 1:ncol(spe)) {
    (scale.spe2[i,j]<-scale.spe2[i,j]/(site.totals[i]*sqrt.spe.totals[j]))   }}
#Adjust the scale abundance species by multplying by the square root of the species matrix total
(adjust.scale.spe2=scale.spe2*sqrt(sum(rowSums(spe))))
# Compare the results
adjust.scale.spe2
spe.chi
adjust.scale.spe2-spe.chi # or: adjust.scale.spe2/spe.chi


## 2.3 Clustering ####

### Calculating Hellinger distance
spe.dhel<-vegdist(spe.hel,method="euclidean") #generates the distance matrix from Hellinger transformed data

# See difference between the two matrices
head(spe.hel)# Hellinger-transformed species data
head(spe.dhel)# Hellinger distances among sites

### Comparison of single and complete linkage clustering

#Perform single linkage clustering
spe.dhel.single<-hclust(spe.dhel, method=”single”)
plot(spe.dhel.single)

#Perform complete linkage clustering
spe.dhel.complete<-hclust(spe.dhel, method=”complete”)
plot(spe.dhel.complete)

### Ward's minimum variance method

# Perform Ward minimum variance clustering
spe.dhel.ward<-hclust(spe.dhel, method=”ward.D2”)
plot(spe.dhel.ward)

# Re-plot the dendrogram by using the square roots of the fusion levels
spe.dhel.ward$height<-sqrt(spe.dhel.ward$height)
plot(spe.dhel.ward)
plot(spe.dhel.ward, hang=-1) # hang=-1 aligns all objets on the same line


#***************************************************************************************#

# 3.Unconstrained Ordination ####

## 3.1. Principal Components Analysis (PCA) ####

### PCA on hellinger-transformed species data
?rda
#### Run the PCA
spe.h.pca <- rda(spe.hel)
# Extract the results
summary(spe.h.pca)

#### Extract specific parts of the results
summary(spe.h.pca) # overall results
summary(spe.h.pca, display=NULL) # only eigenvalues and their contribution to the variance
eigen(cov(spe.hel)) # also compute the eigenvalues
(spe.scores <- scores(spe.h.pca, display="species", choices=c(1,2))) # species scores on the first two PCA axes
(site.scores <- scores(spe.h.pca, display="sites", choices=c(1,2))) # sites scores on the first two PCA axes
#Note: if you don’t specify the number of principal components to extract (e.g. choices=c(1,2) or choices=c(1:2) then all of the scores will be extracted for all of the principal components.

# Identify the significant axis using the Kaiser-Guttman criterion
ev <- spe.h.pca$CA$eig
ev[ev>mean(ev)]
n <- length(ev)
barplot(ev, main="Eigenvalues", col="grey", las=2)
abline(h=mean(ev), col="red")
legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")

### PCA on the environmental variables
#### Run the PCA
env.pca <- rda(env.z) # or rda(env, scale=TRUE)

#### Extract the results
summary(env.pca)
summary(env.pca, scaling=2)

# Identify the significant axis using the Kaiser-Guttman criterion
ev <- env.pca$CA$eig
ev[ev>mean(ev)]
n <- length(ev)
barplot(ev, main="Eigenvalues", col="grey", las=2)
abline(h=mean(ev), col="red")
legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")


### Construct a biplot
#### Biplot of the PCA on transformed species data (scaling 1)
windows()
plot(spe.h.pca)
windows()
?biplot
biplot(spe.h.pca)
windows()
plot(spe.h.pca, scaling=1, type="none", # scaling 1 = distance biplot :
     # distances among abjects in the biplot approximate their Euclidean distances
     # but angles among descriptor vectors DO NOT reflect their correlation
     xlab=c("PC1 (%)", round((spe.h.pca$CA$eig[1]/sum(spe.h.pca$CA$eig))*100,2)),
     ylab=c("PC2 (%)", round((spe.h.pca$CA$eig[2]/sum(spe.h.pca$CA$eig))*100,2)))
points(scores(spe.h.pca, display="sites", choices=c(1,2), scaling=1),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(spe.h.pca, display="species", choices=c(1), scaling=1),
     scores(spe.h.pca, display="species", choices=c(2), scaling=1),
     labels=rownames(scores(spe.h.pca, display="species", scaling=1)),
     col="red", cex=0.8)

# Add arrows for species scores
spe.sc<-scores(spe.h.pca,choices=1:2,scaling=,display="sp")
arrows(0,0,spe.sc[,1],spe.sc[,2],length=0)

#### Biplot on the environmental variables (scaling 2)
windows()
plot(env.pca)
windows()
plot(env.pca, scaling=2, type="none", # scaling 2 = correlation biplot :
     # distances among abjects in the biplot DO NOT approximate their Euclidean distances
     # but angles among descriptor vectors reflect their correlation
     xlab=c("PC1 (%)", round((env.pca$CA$eig[1]/sum(env.pca$CA$eig))*100,2)),
     ylab=c("PC2 (%)", round((env.pca$CA$eig[2]/sum(env.pca$CA$eig))*100,2)),
     xlim=c(-1,1), ylim=c(-1,1))
points(scores(env.pca, display="sites", choices=c(1,2), scaling=2),
       pch=21, col="black", bg="darkgreen", cex=1.2)
text(scores(env.pca, display="species", choices=c(1), scaling=2),
     scores(env.pca, display="species", choices=c(2), scaling=2),
     labels=rownames(scores(env.pca, display="species", scaling=2)),
     col="red", cex=0.8)


#### Abundance of species TRU along an oligotrophic-eutrophic gradient
Sites_scores_Env_Axis1<- scores(env.pca, display="sites", choices=c(1), scaling=2)
spe$ANG
plot( Sites_scores_Env_Axis1, spe$TRU)
summary(lm(spe$TRU~Sites_scores_Env_Axis1))
abline(lm(spe$TRU~Sites_scores_Env_Axis1))

# Challenge 3 ####
# Run the PCA of the mite species abundances
# Which are the significant axes ?
# Which group of sites can you identify ?
# Which species are related to each group of sites
data(mite)
mite.spe <- mite #data from the vegan package
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
mite.spe.hel<-decostand(mite.spe, method="hellinger")
mite.spe.h.pca<-rda(mite.spe.hel)
ev<-mite.spe.h.pca$CA$eig
ev[ev>mean(ev)]
n=length(ev)
barplot(ev, main="Eigenvalues", col="grey", las=2)
abline(h=mean(ev), col="red")
legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")
summary(mite.spe.h.pca, display=NULL)
windows()
plot(mite.spe.h.pca, scaling=1, type="none",
     xlab=c("PC1 (%)", round((mite.spe.h.pca$CA$eig[1]/sum(mite.spe.h.pca$CA$eig))*100,2)),
     ylab=c("PC2 (%)", round((mite.spe.h.pca$CA$eig[2]/sum(mite.spe.h.pca$CA$eig))*100,2)))
points(scores(mite.spe.h.pca, display="sites", choices=c(1,2), scaling=1),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(mite.spe.h.pca, display="species", choices=c(1), scaling=1),
     scores(mite.spe.h.pca, display="species", choices=c(2), scaling=1),
     labels=rownames(scores(mite.spe.h.pca, display="species", scaling=1)),
     col="red", cex=0.8)

## 3.2. Correspondence Analysis (CA) ####
?cca
### Run the CA on species abundances
spe.ca<-cca(spe)

### Identify the significant axis using the Kaiser-Guttman criterion
ev<-spe.ca$CA$eig
ev[ev>mean(ev)]
n=length(ev)
barplot(ev, main="Eigenvalues", col="grey", las=2)
abline(h=mean(ev), col="red")
legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")

### Extract the results
spe.ca
summary(spe.ca, display=NULL)

### Construct a biplot
par(mfrow=c(1,2))
#### scaling 1
plot(spe.ca, scaling=1, type="none", main='CA - biplot scaling 1',
     xlab=c("CA1 (%)", round((spe.ca$CA$eig[1]/sum(spe.ca$CA$eig))*100,2)),
     ylab=c("CA2 (%)", round((spe.ca$CA$eig[2]/sum(spe.ca$CA$eig))*100,2)))
points(scores(spe.ca, display="sites", choices=c(1,2), scaling=1),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(spe.ca, display="species", choices=c(1), scaling=1),
     scores(spe.ca, display="species", choices=c(2), scaling=1),
     labels=rownames(scores(spe.ca, display="species", scaling=1)),
     col="red", cex=0.8)
#### scaling 2
plot(spe.ca, scaling=1, type="none", main='CA - biplot scaling 2',
     xlab=c("CA1 (%)", round((spe.ca$CA$eig[1]/sum(spe.ca$CA$eig))*100,2)),
     ylab=c("CA2 (%)", round((spe.ca$CA$eig[2]/sum(spe.ca$CA$eig))*100,2)), ylim=c(-2,3))
points(scores(spe.ca, display="sites", choices=c(1,2), scaling=2),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(spe.ca, display="species", choices=c(1), scaling=2),
     scores(spe.ca, display="species", choices=c(2), scaling=2),
     labels=rownames(scores(spe.ca, display="species", scaling=2)),
     col="red", cex=0.8)


# Challenge 4 ####
# Run the CA of the mite species abundances
# Which are the significant axes ?
# Which group of sites can you identify ?
# Which species are related to each group of sites ?
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
mite.spe.ca<-cca(mite.spe)
ev<-mite.spe.ca$CA$eig
ev[ev>mean(ev)]
n=length(ev)
barplot(ev, main="Eigenvalues", col="grey", las=2)
abline(h=mean(ev), col="red")
legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")
summary(mite.spe.ca, display=NULL)
windows()
plot(mite.spe.ca, scaling=1, type="none",
     xlab=c("PC1 (%)", round((mite.spe.ca$CA$eig[1]/sum(mite.spe.ca$CA$eig))*100,2)),
     ylab=c("PC2 (%)", round((mite.spe.ca$CA$eig[2]/sum(mite.spe.ca$CA$eig))*100,2)))
points(scores(mite.spe.ca, display="sites", choices=c(1,2), scaling=1),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(mite.spe.ca, display="species", choices=c(1), scaling=1),
     scores(mite.spe.ca, display="species", choices=c(2), scaling=1),
     labels=rownames(scores(mite.spe.ca, display="species", scaling=1)),
     col="red", cex=0.8)


## 3.3. Principal Coordinate Anaysis (PCoA) ####
### Run the PCoA on hellinger transformed species abundances
?cmdscale
cmdscale(dist(spe.hel), k=(nrow(spe)-1), eig=TRUE)
?pcoa
spe.h.pcoa<-pcoa(dist(spe.hel))
### Extract the results
spe.h.pcoa
### Construct the biplot
windows()
biplot.pcoa(spe.h.pcoa, spe.hel)

### Run the PCoA on a Bray-Curtis dissimilarity matrix
spe.db
spe.bray.pcoa<-pcoa(spe.db, correction="cailliez")
spe.bray.pcoa
windows()
biplot.pcoa(spe.bray.pcoa)
# The distance measure choosen thus strongly influenced the PCoA results

# Challenge 5 ####
# Run the PCoA of the hellinger-transformed mite species abundances
# Which are the significant axes ?
# Which group of sites can you identify ?
# Which species are related to each group of sites ?
# How does the PCoA results compare with the PCA
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
mite.spe.h.pcoa<-pcoa(dist(mite.spe.hel))
mite.spe.h.pcoa
windows()
biplot.pcoa(mite.spe.h.pcoa, mite.spe.hel)


## 3.4. Nonmetric Multidimensional Scaling (NMDS) ####
?metaMDS
### Run the NMDS of species abundances based on the Bray-Curtis distance in 2 dimensions
spe.nmds<-metaMDS(spe, distance='bray', k=2) # argument k precise the number of dimensions choosen (i.e. the number of ordination axis)

### Extract the results
spe.nmds
spe.nmds$stress

### Assess the goodness of fit
spe.nmds$stress
stressplot(spe.nmds, main='Shepard plot')

### Construct the biplot
windows()
plot(spe.nmds, type="none", main=paste('NMDS/Bray - Stress=', round(spe.nmds$stress, 3)),
     xlab=c("NMDS1"),
     ylab=c("NMDS2"))
points(scores(spe.nmds, display="sites", choices=c(1,2)),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(spe.nmds, display="species", choices=c(1)),
     scores(spe.nmds, display="species", choices=c(2)),
     labels=rownames(scores(spe.nmds, display="species")),
     col="red", cex=0.8)

# Challenge 6 ####
# Run the NMDS of the mite species abundances in 2 dimensions based on a Bray-Curtis distance
# Assess the goodness-of-fit of the ordination and interpret the biplot
# (answer below)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
mite.spe.nmds<-metaMDS(mite.spe, distance='bray', k=2)
### Extract the results
mite.spe.nmds

### Assess the goodness of fit
mite.spe.nmds$stress
stressplot(mite.spe.nmds, main='Shepard plot')

### Construct the biplot
windows()
plot(mite.spe.nmds, type="none", main=paste('NMDS/Bray - Stress=', round(mite.spe.nmds$stress, 3)),
     xlab=c("NMDS1"),
     ylab=c("NMDS2"))
points(scores(mite.spe.nmds, display="sites", choices=c(1,2)),
       pch=21, col="black", bg="steelblue", cex=1.2)
text(scores(mite.spe.nmds, display="species", choices=c(1)),
     scores(mite.spe.nmds, display="species", choices=c(2)),
     labels=rownames(scores(mite.spe.nmds, display="species")),
     col="red", cex=0.8)