|
|
|
@ -352,7 +352,7 @@ kw_no2_post <- lapply(
|
|
|
|
|
# QUESTION: How does the significance change with another p.adjust.method? What is your overall interpretation of the results? |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Principal component analysis #### |
|
|
|
|
### Principal component analysis (PCA) #### |
|
|
|
|
|
|
|
|
|
# The math behind PCA and RDA is very similar --> the difference is that RDA is constrained, |
|
|
|
|
# while PCA is not. So we can use the same R function, but without providing any predictor variables. |
|
|
|
@ -365,12 +365,17 @@ kw_no2_post <- lapply(
|
|
|
|
|
# data with a linear correlation structure. |
|
|
|
|
env_pca <- rda(meta[, 7:11], scale = T) |
|
|
|
|
|
|
|
|
|
# This unconstrained ordination object can be found in $CA of the env_pca object. |
|
|
|
|
# To explore this object further, use str() and the help function. |
|
|
|
|
# Look at contribution of all PCs (eigenvalues) |
|
|
|
|
barplot(env_pca$CA$eig/sum(env_pca$CA$eig)) |
|
|
|
|
|
|
|
|
|
# Interpretation of PCA: https://sites.google.com/site/mb3gustame/indirect-gradient-analysis/pca |
|
|
|
|
# Scaling 2: correlation plot |
|
|
|
|
|
|
|
|
|
# I am jumping ahead here already to the plotting workshop... |
|
|
|
|
# save coordinates of default plot as speparate R object |
|
|
|
|
env_pca_scaling2 <- plot(env_pca) |
|
|
|
|
env_pca_scaling2 <- plot(env_pca, scaling = 2) |
|
|
|
|
|
|
|
|
|
# start with empty plot |
|
|
|
|
# this will define the range for x and y axis automatically |
|
|
|
@ -428,7 +433,7 @@ legend(
|
|
|
|
|
title = "Salinity" |
|
|
|
|
) |
|
|
|
|
legend( |
|
|
|
|
"bottomright", |
|
|
|
|
"topleft", |
|
|
|
|
legend = levels(meta$day), |
|
|
|
|
col = day.color, |
|
|
|
|
pch = 15, |
|
|
|
@ -440,7 +445,7 @@ legend(
|
|
|
|
|
|
|
|
|
|
### Hierarchical clustering #### |
|
|
|
|
|
|
|
|
|
# Calcualte Bray-Curtis dissimilarities |
|
|
|
|
# Calculate Bray-Curtis dissimilarities |
|
|
|
|
bc_pro <- vegdist(t(otu_pro_rel)) |
|
|
|
|
|
|
|
|
|
# COMMENT: It is very important that Bray-Curtis dissimilarities are calculated based on the percentages |
|
|
|
@ -452,7 +457,8 @@ bc_pro_clust <- hclust(bc_pro, method = "average")
|
|
|
|
|
bc_pro_clust <- hclust(bc_pro, method = "single") |
|
|
|
|
|
|
|
|
|
# Determine optimal linkage algorithm |
|
|
|
|
cor(bc_pro, cophenetic(bc_pro_clust)) |
|
|
|
|
stats::cor(bc_pro, cophenetic(bc_pro_clust)) |
|
|
|
|
plot(bc_pro, cophenetic(bc_pro_clust)) |
|
|
|
|
# topology of average linkage cluster diagram has highest correlation with input dissimilarity matrix |
|
|
|
|
|
|
|
|
|
# Quick and dirty plot |
|
|
|
@ -465,7 +471,7 @@ rect.hclust(bc_pro_clust, h = 0.5)
|
|
|
|
|
bc_pro_groups <- cutree(bc_pro_clust, h = 0.5) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Principal coordinate analysis #### |
|
|
|
|
### Principal coordinate analysis (PCoA) #### |
|
|
|
|
|
|
|
|
|
# In R, the easiest way to calculate a PCoA is to use the betadisper function. |
|
|
|
|
# This function also assesses within-group heterogeneity. |
|
|
|
@ -476,10 +482,14 @@ betadisper_pro <- betadisper(
|
|
|
|
|
meta$salinity, |
|
|
|
|
sqrt.dist = T |
|
|
|
|
) |
|
|
|
|
str(betadisper_pro) |
|
|
|
|
|
|
|
|
|
# Show PCoA |
|
|
|
|
plot(betadisper_pro) |
|
|
|
|
|
|
|
|
|
# Extract PCoA object for different plotting layout |
|
|
|
|
View(betadisper_pro$vectors) |
|
|
|
|
|
|
|
|
|
# Get eigenvalues |
|
|
|
|
barplot(betadisper_pro$eig/sum(betadisper_pro$eig) * 100, las = 2, cex.names = 0.5, ylab = "% variation") |
|
|
|
|
|
|
|
|
@ -489,7 +499,7 @@ barplot(betadisper_pro$eig/sum(betadisper_pro$eig) * 100, las = 2, cex.names = 0
|
|
|
|
|
boxplot(betadisper_pro) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Non-metric multidimensional scaling #### |
|
|
|
|
### Non-metric multidimensional scaling (NMDS) #### |
|
|
|
|
|
|
|
|
|
# Calculate NMDS based on Bray-Curtis dissimilarity (default) |
|
|
|
|
# It is possible to supply the matrix with the proportions directly, or already the |
|
|
|
@ -520,6 +530,13 @@ points(
|
|
|
|
|
# QUESTION: How well represented are the original Bray-Curtis dissimilarities? |
|
|
|
|
# The original Bray-Curtis dissimilarities should always be considered in the interpretation. |
|
|
|
|
|
|
|
|
|
# Fit additional variable onto the ordination |
|
|
|
|
envfit_pro <- envfit(nmds_pro, meta[, 7:11]) |
|
|
|
|
plot(envfit_pro) |
|
|
|
|
|
|
|
|
|
# Check that fitted variables behave linearly before interpreting envfit |
|
|
|
|
ordisurf(nmds_pro, meta$sio2_uM) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### PERMANOVA #### |
|
|
|
|
|
|
|
|
@ -534,7 +551,7 @@ otu_pro_perm <- otu_pro_perm[rowSums(otu_pro_perm) > 0, ]
|
|
|
|
|
|
|
|
|
|
perm_ctrl <- how( |
|
|
|
|
within = Within(type = "free"), |
|
|
|
|
plots = Plots(strata = paste(meta_stats$salinity, meta_stats$replicate, sep = "_"), type = "free"), |
|
|
|
|
plots = Plots(strata = meta_stats$replicate, type = "free"), |
|
|
|
|
nperm = 999 |
|
|
|
|
) |
|
|
|
|
|
|
|
|
@ -549,7 +566,7 @@ permanova_out <- adonis2(t(otu_pro_perm) ~ day * salinity, data = meta_stats, sq
|
|
|
|
|
### ANOSIM #### |
|
|
|
|
|
|
|
|
|
# Similar to betadisper, ANOSIM is commonly run on the interaction, but we will use salinity |
|
|
|
|
# as groupsing factor here for demonstration purposes. |
|
|
|
|
# as grouping factor here for demonstration purposes. |
|
|
|
|
# Similar to Kruskal-Wallis, ANOSIM is unifactorial. |
|
|
|
|
|
|
|
|
|
# Calculate ANOSIM based on Bray-Curtis dissimilarity |
|
|
|
@ -560,12 +577,44 @@ anosim(t(otu_pro_perm), meta_stats$salinity)
|
|
|
|
|
# Calculate pairwise ANOSIM statistics |
|
|
|
|
# Download function from: https://raw.githubusercontent.com/chassenr/Tutorials/master/R_roundtable_sequence_analysis/anosimPosthoc.R |
|
|
|
|
|
|
|
|
|
source("C:/Users/chassenrueck/Documents/Repos/Tutorials/R_roundtable_sequence_analysis/anosimPosthoc.R") |
|
|
|
|
ANOSIMposthoc=function(M,E,distance="bray", padj="fdr"){ |
|
|
|
|
E=droplevels(E) |
|
|
|
|
Mlist=list() |
|
|
|
|
for(i in 1:length(levels(E))){ |
|
|
|
|
Mlist[[i]]=M[E==levels(E)[i],] |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Elist=list() |
|
|
|
|
for(i in 1:length(levels(E))){ |
|
|
|
|
Elist[[i]]=as.numeric(E[E==levels(E)[i]]) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
result=list(anosimR=matrix(NA,length(levels(E)),length(levels(E))), |
|
|
|
|
anosimP=matrix(NA,length(levels(E)),length(levels(E))), |
|
|
|
|
anosimPadj=matrix(NA,length(levels(E)),length(levels(E)))) |
|
|
|
|
colnames(result$anosimR)=colnames(result$anosimP)=levels(E) |
|
|
|
|
rownames(result$anosimR)=rownames(result$anosimP)=levels(E) |
|
|
|
|
for(i in 1:(length(levels(E))-1)){ |
|
|
|
|
for(j in (i+1):length(levels(E))){ |
|
|
|
|
temp=anosim(rbind(Mlist[[i]],Mlist[[j]]),c(Elist[[i]],Elist[[j]]),distance=distance) |
|
|
|
|
result$anosimR[j,i]=temp$statistic |
|
|
|
|
result$anosimP[j,i]=temp$signif |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
result$anosimPadj=matrix(p.adjust(as.vector(result$anosimP),method=padj, |
|
|
|
|
n=length(which(!is.na(as.vector(result$anosimP))))), |
|
|
|
|
length(levels(E)),length(levels(E))) |
|
|
|
|
colnames(result$anosimPadj)=levels(E) |
|
|
|
|
rownames(result$anosimPadj)=levels(E) |
|
|
|
|
return(result) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ANOSIMposthoc(t(otu_pro_perm), meta_stats$salinity) |
|
|
|
|
|
|
|
|
|
# I recommend to focus on the ANOSIM R in the interpretation. |
|
|
|
|
# R < 0.3: weak separation |
|
|
|
|
# R < 0.3: weak to no separation |
|
|
|
|
# R > 0.7: strong separation |
|
|
|
|
# see also https://pubmed.ncbi.nlm.nih.gov/17892477/ |
|
|
|
|
|
|
|
|
|
# QUESTION: What is the difference in the interpretation between PERMANOVA and ANOSIM? |
|
|
|
|
# Can you imagine cases, where there is disagreement between the result of these 2 methods? |
|
|
|
@ -579,12 +628,25 @@ ANOSIMposthoc(t(otu_pro_perm), meta_stats$salinity)
|
|
|
|
|
|
|
|
|
|
simper_pro <- simper(t(otu_pro_perm), meta_stats$salinity) |
|
|
|
|
|
|
|
|
|
# Let's look at the contribution of the most important taxa according to simper |
|
|
|
|
barplot(simper_pro$control_psu12$cusum[1:100]) |
|
|
|
|
|
|
|
|
|
# Which taxa contribute cumulatively to 50% of the observed differences between groups? |
|
|
|
|
simper_pro$control_psu12$species[simper_pro$control_psu12$ord[1:which(simper_pro$control_psu12$cusum >= 0.5)[1]]] |
|
|
|
|
tmp <- simper_pro$control_psu12$species[simper_pro$control_psu12$ord[1:which(simper_pro$control_psu12$cusum >= 0.5)[1]]] |
|
|
|
|
|
|
|
|
|
barplot( |
|
|
|
|
otu_pro_perm[tmp, meta_stats$salinity %in% c("control", "psu12")], |
|
|
|
|
col = rainbow(length(tmp)), |
|
|
|
|
las = 2, |
|
|
|
|
cex.names = 0.5 |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Redundancy Analysis #### |
|
|
|
|
|
|
|
|
|
# WARNING: Be sure that the relationships that you are investigating are actually linear, |
|
|
|
|
# when applying this method. |
|
|
|
|
|
|
|
|
|
# Inspect collinearity between predictor variables. |
|
|
|
|
# As RDA will assume linear combinations of the predictors, we will use pearson correlations. |
|
|
|
|
# As a rule of thumb, anything with an |r| > 0.8 will be highly problematic. |
|
|
|
@ -594,7 +656,7 @@ cor(meta_stats[, 7:11])
|
|
|
|
|
|
|
|
|
|
# Our response variable (community matrix) is compositional. |
|
|
|
|
# We will apply a clr-transformation to coerce it into something resembling normality... |
|
|
|
|
# Since the otu count matrix is sparce, rare otus have to be exlcuded |
|
|
|
|
# Since the otu count matrix is sparce, rare otus have to be excluded |
|
|
|
|
|
|
|
|
|
otu_pro_sub <- otu_pro[rownames(otu_pro_perm), colnames(otu_pro_perm)] |
|
|
|
|
otu_pro_filt <- otu_pro_sub[apply(otu_pro_perm, 1, function(x) sum(x >= 0.1) >= length(x) * 0.1), ] |
|
|
|
@ -659,7 +721,7 @@ plot(rda_otu_varpart)
|
|
|
|
|
### Differential abundance #### |
|
|
|
|
|
|
|
|
|
# Whether you are working with omics data or any other multivariate response, |
|
|
|
|
# usually the aim is not just to see if there is an overal change, but which variables |
|
|
|
|
# usually the aim is not just to see if there is an overall change, but which variables |
|
|
|
|
# (taxa) are responsible. |
|
|
|
|
|
|
|
|
|
# For compositional sequence data, there is a package called ALDEx2, which accounts for compositionality |
|
|
|
@ -676,6 +738,10 @@ otu_pro_d21 <- otu_pro[, rownames(meta_d21)]
|
|
|
|
|
otu_pro_d21_filt <- otu_pro_d21[apply(prop.table(otu_pro_d21, 2) * 100, 1, function(x) sum(x >= 0.1) >= 3), ] |
|
|
|
|
dim(otu_pro_d21_filt) |
|
|
|
|
|
|
|
|
|
# Removal of sequence |
|
|
|
|
# I would be careful witha filter that is removing too much of your data (i.e. more than 30-50%) |
|
|
|
|
summary(colSums(otu_pro_d21_filt)/colSums(otu_pro_d21)) |
|
|
|
|
|
|
|
|
|
# Check that filtering did not change beta diversity patterns |
|
|
|
|
plot( |
|
|
|
|
vegdist(t(prop.table(otu_pro_d21, 2) * 100)), |
|
|
|
@ -701,13 +767,19 @@ sum(aldex_out$glm.eBH <= 0.05)
|
|
|
|
|
summary(aldex_out$kw.eBH) |
|
|
|
|
hist(aldex_out$kw.eBH, breaks = 50) |
|
|
|
|
abline(v = 0.05) |
|
|
|
|
sum(aldex_out$kw.eBH <= 0.05) |
|
|
|
|
sum(aldex_out$kw.eBH <= 0.1) |
|
|
|
|
|
|
|
|
|
# How many ASVs detected as differentially enriched? |
|
|
|
|
# How many ASVs are detected as differentially enriched? |
|
|
|
|
# Due to the low sample size, it is possible that p-values are not reliable. |
|
|
|
|
# Especially for non-parametric tests, it may not be mathematically possible to obtain |
|
|
|
|
# p-values low enough to not be above the significance threshold after correction. |
|
|
|
|
|
|
|
|
|
# I recommend to use the p-values obtained here, only as a means to find a starting point |
|
|
|
|
# for the interpretation, i.e. to choose some taxa (out of hundreds or thousands) to begin |
|
|
|
|
# describing patterns between experimental conditions. If those patterns do not make sense, i.e. |
|
|
|
|
# they are not supported by independent analyses or previous literature, I would use them only |
|
|
|
|
# to postulate new hypotheses and not to make final conclusions about an ecological phenomenon. |
|
|
|
|
|
|
|
|
|
# Compare parametric and non-parametric results |
|
|
|
|
plot(aldex_out$kw.ep, aldex_out$glm.eBH) |
|
|
|
|
abline(v = 0.05, h = 0.05) |
|
|
|
@ -729,7 +801,7 @@ dim(otu_pro_diff_filt)
|
|
|
|
|
# This is actually the first time, that we would like to get more than just a sequence number for the taxa. |
|
|
|
|
tax_pro_diff_filt <- tax_pro[rownames(otu_pro_diff_filt), ] |
|
|
|
|
|
|
|
|
|
# While it is usually recommended to build a heatmap from sratch using basic plotting elements, |
|
|
|
|
# While it is usually recommended to build a heatmap from scratch using basic plotting elements, |
|
|
|
|
# we will use a shortcut here (provided by the gplots package). |
|
|
|
|
# For better visibility of small changes, show square-root transformed proportions. |
|
|
|
|
heatmap.2( |
|
|
|
@ -935,7 +1007,6 @@ View(tax_pro[cluster.pick.rep[[i]], ])
|
|
|
|
|
# Correlation of eigengenes with environmental parameters |
|
|
|
|
cor_me_env <- cor(meta_stats[, 7:11], cluster.ME$eigengenes) |
|
|
|
|
|
|
|
|
|
# There seems to be something going on between po4 and M2+4+10, no3 and M5, no2 and M6, nh4 and M9... |
|
|
|
|
# However, who says that the relationship should be expected to be linear? |
|
|
|
|
# Also, we did not check assumptions... |
|
|
|
|
# In general, such an analysis may be more suitable for environmental samples... |
|
|
|
|