Browse Source

update R stats course material

master
chassenr 1 year ago
parent
commit
6168c517e9
  1. 107
      R_statistics_2022/R_stats_script.R

107
R_statistics_2022/R_stats_script.R

@ -352,7 +352,7 @@ kw_no2_post <- lapply(
# QUESTION: How does the significance change with another p.adjust.method? What is your overall interpretation of the results?
### Principal component analysis ####
### Principal component analysis (PCA) ####
# The math behind PCA and RDA is very similar --> the difference is that RDA is constrained,
# while PCA is not. So we can use the same R function, but without providing any predictor variables.
@ -365,12 +365,17 @@ kw_no2_post <- lapply(
# data with a linear correlation structure.
env_pca <- rda(meta[, 7:11], scale = T)
# This unconstrained ordination object can be found in $CA of the env_pca object.
# To explore this object further, use str() and the help function.
# Look at contribution of all PCs (eigenvalues)
barplot(env_pca$CA$eig/sum(env_pca$CA$eig))
# Interpretation of PCA: https://sites.google.com/site/mb3gustame/indirect-gradient-analysis/pca
# Scaling 2: correlation plot
# I am jumping ahead here already to the plotting workshop...
# save coordinates of default plot as speparate R object
env_pca_scaling2 <- plot(env_pca)
env_pca_scaling2 <- plot(env_pca, scaling = 2)
# start with empty plot
# this will define the range for x and y axis automatically
@ -428,7 +433,7 @@ legend(
title = "Salinity"
)
legend(
"bottomright",
"topleft",
legend = levels(meta$day),
col = day.color,
pch = 15,
@ -440,7 +445,7 @@ legend(
### Hierarchical clustering ####
# Calcualte Bray-Curtis dissimilarities
# Calculate Bray-Curtis dissimilarities
bc_pro <- vegdist(t(otu_pro_rel))
# COMMENT: It is very important that Bray-Curtis dissimilarities are calculated based on the percentages
@ -452,7 +457,8 @@ bc_pro_clust <- hclust(bc_pro, method = "average")
bc_pro_clust <- hclust(bc_pro, method = "single")
# Determine optimal linkage algorithm
cor(bc_pro, cophenetic(bc_pro_clust))
stats::cor(bc_pro, cophenetic(bc_pro_clust))
plot(bc_pro, cophenetic(bc_pro_clust))
# topology of average linkage cluster diagram has highest correlation with input dissimilarity matrix
# Quick and dirty plot
@ -465,7 +471,7 @@ rect.hclust(bc_pro_clust, h = 0.5)
bc_pro_groups <- cutree(bc_pro_clust, h = 0.5)
### Principal coordinate analysis ####
### Principal coordinate analysis (PCoA) ####
# In R, the easiest way to calculate a PCoA is to use the betadisper function.
# This function also assesses within-group heterogeneity.
@ -476,10 +482,14 @@ betadisper_pro <- betadisper(
meta$salinity,
sqrt.dist = T
)
str(betadisper_pro)
# Show PCoA
plot(betadisper_pro)
# Extract PCoA object for different plotting layout
View(betadisper_pro$vectors)
# Get eigenvalues
barplot(betadisper_pro$eig/sum(betadisper_pro$eig) * 100, las = 2, cex.names = 0.5, ylab = "% variation")
@ -489,7 +499,7 @@ barplot(betadisper_pro$eig/sum(betadisper_pro$eig) * 100, las = 2, cex.names = 0
boxplot(betadisper_pro)
### Non-metric multidimensional scaling ####
### Non-metric multidimensional scaling (NMDS) ####
# Calculate NMDS based on Bray-Curtis dissimilarity (default)
# It is possible to supply the matrix with the proportions directly, or already the
@ -520,6 +530,13 @@ points(
# QUESTION: How well represented are the original Bray-Curtis dissimilarities?
# The original Bray-Curtis dissimilarities should always be considered in the interpretation.
# Fit additional variable onto the ordination
envfit_pro <- envfit(nmds_pro, meta[, 7:11])
plot(envfit_pro)
# Check that fitted variables behave linearly before interpreting envfit
ordisurf(nmds_pro, meta$sio2_uM)
### PERMANOVA ####
@ -534,7 +551,7 @@ otu_pro_perm <- otu_pro_perm[rowSums(otu_pro_perm) > 0, ]
perm_ctrl <- how(
within = Within(type = "free"),
plots = Plots(strata = paste(meta_stats$salinity, meta_stats$replicate, sep = "_"), type = "free"),
plots = Plots(strata = meta_stats$replicate, type = "free"),
nperm = 999
)
@ -549,7 +566,7 @@ permanova_out <- adonis2(t(otu_pro_perm) ~ day * salinity, data = meta_stats, sq
### ANOSIM ####
# Similar to betadisper, ANOSIM is commonly run on the interaction, but we will use salinity
# as groupsing factor here for demonstration purposes.
# as grouping factor here for demonstration purposes.
# Similar to Kruskal-Wallis, ANOSIM is unifactorial.
# Calculate ANOSIM based on Bray-Curtis dissimilarity
@ -560,12 +577,44 @@ anosim(t(otu_pro_perm), meta_stats$salinity)
# Calculate pairwise ANOSIM statistics
# Download function from: https://raw.githubusercontent.com/chassenr/Tutorials/master/R_roundtable_sequence_analysis/anosimPosthoc.R
source("C:/Users/chassenrueck/Documents/Repos/Tutorials/R_roundtable_sequence_analysis/anosimPosthoc.R")
ANOSIMposthoc=function(M,E,distance="bray", padj="fdr"){
E=droplevels(E)
Mlist=list()
for(i in 1:length(levels(E))){
Mlist[[i]]=M[E==levels(E)[i],]
}
Elist=list()
for(i in 1:length(levels(E))){
Elist[[i]]=as.numeric(E[E==levels(E)[i]])
}
result=list(anosimR=matrix(NA,length(levels(E)),length(levels(E))),
anosimP=matrix(NA,length(levels(E)),length(levels(E))),
anosimPadj=matrix(NA,length(levels(E)),length(levels(E))))
colnames(result$anosimR)=colnames(result$anosimP)=levels(E)
rownames(result$anosimR)=rownames(result$anosimP)=levels(E)
for(i in 1:(length(levels(E))-1)){
for(j in (i+1):length(levels(E))){
temp=anosim(rbind(Mlist[[i]],Mlist[[j]]),c(Elist[[i]],Elist[[j]]),distance=distance)
result$anosimR[j,i]=temp$statistic
result$anosimP[j,i]=temp$signif
}
}
result$anosimPadj=matrix(p.adjust(as.vector(result$anosimP),method=padj,
n=length(which(!is.na(as.vector(result$anosimP))))),
length(levels(E)),length(levels(E)))
colnames(result$anosimPadj)=levels(E)
rownames(result$anosimPadj)=levels(E)
return(result)
}
ANOSIMposthoc(t(otu_pro_perm), meta_stats$salinity)
# I recommend to focus on the ANOSIM R in the interpretation.
# R < 0.3: weak separation
# R < 0.3: weak to no separation
# R > 0.7: strong separation
# see also https://pubmed.ncbi.nlm.nih.gov/17892477/
# QUESTION: What is the difference in the interpretation between PERMANOVA and ANOSIM?
# Can you imagine cases, where there is disagreement between the result of these 2 methods?
@ -579,12 +628,25 @@ ANOSIMposthoc(t(otu_pro_perm), meta_stats$salinity)
simper_pro <- simper(t(otu_pro_perm), meta_stats$salinity)
# Let's look at the contribution of the most important taxa according to simper
barplot(simper_pro$control_psu12$cusum[1:100])
# Which taxa contribute cumulatively to 50% of the observed differences between groups?
simper_pro$control_psu12$species[simper_pro$control_psu12$ord[1:which(simper_pro$control_psu12$cusum >= 0.5)[1]]]
tmp <- simper_pro$control_psu12$species[simper_pro$control_psu12$ord[1:which(simper_pro$control_psu12$cusum >= 0.5)[1]]]
barplot(
otu_pro_perm[tmp, meta_stats$salinity %in% c("control", "psu12")],
col = rainbow(length(tmp)),
las = 2,
cex.names = 0.5
)
### Redundancy Analysis ####
# WARNING: Be sure that the relationships that you are investigating are actually linear,
# when applying this method.
# Inspect collinearity between predictor variables.
# As RDA will assume linear combinations of the predictors, we will use pearson correlations.
# As a rule of thumb, anything with an |r| > 0.8 will be highly problematic.
@ -594,7 +656,7 @@ cor(meta_stats[, 7:11])
# Our response variable (community matrix) is compositional.
# We will apply a clr-transformation to coerce it into something resembling normality...
# Since the otu count matrix is sparce, rare otus have to be exlcuded
# Since the otu count matrix is sparce, rare otus have to be excluded
otu_pro_sub <- otu_pro[rownames(otu_pro_perm), colnames(otu_pro_perm)]
otu_pro_filt <- otu_pro_sub[apply(otu_pro_perm, 1, function(x) sum(x >= 0.1) >= length(x) * 0.1), ]
@ -659,7 +721,7 @@ plot(rda_otu_varpart)
### Differential abundance ####
# Whether you are working with omics data or any other multivariate response,
# usually the aim is not just to see if there is an overal change, but which variables
# usually the aim is not just to see if there is an overall change, but which variables
# (taxa) are responsible.
# For compositional sequence data, there is a package called ALDEx2, which accounts for compositionality
@ -676,6 +738,10 @@ otu_pro_d21 <- otu_pro[, rownames(meta_d21)]
otu_pro_d21_filt <- otu_pro_d21[apply(prop.table(otu_pro_d21, 2) * 100, 1, function(x) sum(x >= 0.1) >= 3), ]
dim(otu_pro_d21_filt)
# Removal of sequence
# I would be careful witha filter that is removing too much of your data (i.e. more than 30-50%)
summary(colSums(otu_pro_d21_filt)/colSums(otu_pro_d21))
# Check that filtering did not change beta diversity patterns
plot(
vegdist(t(prop.table(otu_pro_d21, 2) * 100)),
@ -701,13 +767,19 @@ sum(aldex_out$glm.eBH <= 0.05)
summary(aldex_out$kw.eBH)
hist(aldex_out$kw.eBH, breaks = 50)
abline(v = 0.05)
sum(aldex_out$kw.eBH <= 0.05)
sum(aldex_out$kw.eBH <= 0.1)
# How many ASVs detected as differentially enriched?
# How many ASVs are detected as differentially enriched?
# Due to the low sample size, it is possible that p-values are not reliable.
# Especially for non-parametric tests, it may not be mathematically possible to obtain
# p-values low enough to not be above the significance threshold after correction.
# I recommend to use the p-values obtained here, only as a means to find a starting point
# for the interpretation, i.e. to choose some taxa (out of hundreds or thousands) to begin
# describing patterns between experimental conditions. If those patterns do not make sense, i.e.
# they are not supported by independent analyses or previous literature, I would use them only
# to postulate new hypotheses and not to make final conclusions about an ecological phenomenon.
# Compare parametric and non-parametric results
plot(aldex_out$kw.ep, aldex_out$glm.eBH)
abline(v = 0.05, h = 0.05)
@ -729,7 +801,7 @@ dim(otu_pro_diff_filt)
# This is actually the first time, that we would like to get more than just a sequence number for the taxa.
tax_pro_diff_filt <- tax_pro[rownames(otu_pro_diff_filt), ]
# While it is usually recommended to build a heatmap from sratch using basic plotting elements,
# While it is usually recommended to build a heatmap from scratch using basic plotting elements,
# we will use a shortcut here (provided by the gplots package).
# For better visibility of small changes, show square-root transformed proportions.
heatmap.2(
@ -935,7 +1007,6 @@ View(tax_pro[cluster.pick.rep[[i]], ])
# Correlation of eigengenes with environmental parameters
cor_me_env <- cor(meta_stats[, 7:11], cluster.ME$eigengenes)
# There seems to be something going on between po4 and M2+4+10, no3 and M5, no2 and M6, nh4 and M9...
# However, who says that the relationship should be expected to be linear?
# Also, we did not check assumptions...
# In general, such an analysis may be more suitable for environmental samples...

Loading…
Cancel
Save