### update R stats course material

master
chassenr 1 year ago
parent
commit
6168c517e9
1 changed files with 89 additions and 18 deletions
1. 107
R_statistics_2022/R_stats_script.R

#### 107 R_statistics_2022/R_stats_script.R Unescape Escape View File

 `@ -352,7 +352,7 @@ kw_no2_post <- lapply(` `# QUESTION: How does the significance change with another p.adjust.method? What is your overall interpretation of the results?` ``` ``` ``` ``` `### Principal component analysis ####` `### Principal component analysis (PCA) ####` ``` ``` `# The math behind PCA and RDA is very similar --> the difference is that RDA is constrained,` `# while PCA is not. So we can use the same R function, but without providing any predictor variables.` `@ -365,12 +365,17 @@ kw_no2_post <- lapply(` `# data with a linear correlation structure.` `env_pca <- rda(meta[, 7:11], scale = T)` ``` ``` `# This unconstrained ordination object can be found in \$CA of the env_pca object.` `# To explore this object further, use str() and the help function.` `# Look at contribution of all PCs (eigenvalues)` `barplot(env_pca\$CA\$eig/sum(env_pca\$CA\$eig))` ``` ``` `# Interpretation of PCA: https://sites.google.com/site/mb3gustame/indirect-gradient-analysis/pca` `# Scaling 2: correlation plot` ``` ``` `# I am jumping ahead here already to the plotting workshop...` `# save coordinates of default plot as speparate R object` `env_pca_scaling2 <- plot(env_pca)` `env_pca_scaling2 <- plot(env_pca, scaling = 2)` ``` ``` `# start with empty plot` `# this will define the range for x and y axis automatically` `@ -428,7 +433,7 @@ legend(` ` title = "Salinity"` `)` `legend(` ` "bottomright",` ` "topleft",` ` legend = levels(meta\$day),` ` col = day.color,` ` pch = 15,` `@ -440,7 +445,7 @@ legend(` ``` ``` `### Hierarchical clustering ####` ``` ``` `# Calcualte Bray-Curtis dissimilarities` `# Calculate Bray-Curtis dissimilarities` `bc_pro <- vegdist(t(otu_pro_rel))` ``` ``` `# COMMENT: It is very important that Bray-Curtis dissimilarities are calculated based on the percentages ` `@ -452,7 +457,8 @@ bc_pro_clust <- hclust(bc_pro, method = "average")` `bc_pro_clust <- hclust(bc_pro, method = "single")` ``` ``` `# Determine optimal linkage algorithm` `cor(bc_pro, cophenetic(bc_pro_clust))` `stats::cor(bc_pro, cophenetic(bc_pro_clust))` `plot(bc_pro, cophenetic(bc_pro_clust))` `# topology of average linkage cluster diagram has highest correlation with input dissimilarity matrix` ``` ``` `# Quick and dirty plot` `@ -465,7 +471,7 @@ rect.hclust(bc_pro_clust, h = 0.5)` `bc_pro_groups <- cutree(bc_pro_clust, h = 0.5)` ``` ``` ``` ``` `### Principal coordinate analysis ####` `### Principal coordinate analysis (PCoA) ####` ``` ``` `# In R, the easiest way to calculate a PCoA is to use the betadisper function.` `# This function also assesses within-group heterogeneity.` `@ -476,10 +482,14 @@ betadisper_pro <- betadisper(` ` meta\$salinity,` ` sqrt.dist = T` `)` `str(betadisper_pro)` ``` ``` `# Show PCoA` `plot(betadisper_pro)` ``` ``` `# Extract PCoA object for different plotting layout` `View(betadisper_pro\$vectors)` ``` ``` `# Get eigenvalues` `barplot(betadisper_pro\$eig/sum(betadisper_pro\$eig) * 100, las = 2, cex.names = 0.5, ylab = "% variation")` ``` ``` `@ -489,7 +499,7 @@ barplot(betadisper_pro\$eig/sum(betadisper_pro\$eig) * 100, las = 2, cex.names = 0` `boxplot(betadisper_pro)` ``` ``` ``` ``` `### Non-metric multidimensional scaling ####` `### Non-metric multidimensional scaling (NMDS) ####` ``` ``` `# Calculate NMDS based on Bray-Curtis dissimilarity (default)` `# It is possible to supply the matrix with the proportions directly, or already the ` `@ -520,6 +530,13 @@ points(` `# QUESTION: How well represented are the original Bray-Curtis dissimilarities? ` `# The original Bray-Curtis dissimilarities should always be considered in the interpretation.` ``` ``` `# Fit additional variable onto the ordination` `envfit_pro <- envfit(nmds_pro, meta[, 7:11])` `plot(envfit_pro)` ``` ``` `# Check that fitted variables behave linearly before interpreting envfit` `ordisurf(nmds_pro, meta\$sio2_uM)` ``` ``` ``` ``` `### PERMANOVA ####` ``` ``` `@ -534,7 +551,7 @@ otu_pro_perm <- otu_pro_perm[rowSums(otu_pro_perm) > 0, ]` ``` ``` `perm_ctrl <- how(` ` within = Within(type = "free"),` ` plots = Plots(strata = paste(meta_stats\$salinity, meta_stats\$replicate, sep = "_"), type = "free"),` ` plots = Plots(strata = meta_stats\$replicate, type = "free"),` ` nperm = 999` `)` ``` ``` `@ -549,7 +566,7 @@ permanova_out <- adonis2(t(otu_pro_perm) ~ day * salinity, data = meta_stats, sq` `### ANOSIM ####` ``` ``` `# Similar to betadisper, ANOSIM is commonly run on the interaction, but we will use salinity` `# as groupsing factor here for demonstration purposes.` `# as grouping factor here for demonstration purposes.` `# Similar to Kruskal-Wallis, ANOSIM is unifactorial.` ``` ``` `# Calculate ANOSIM based on Bray-Curtis dissimilarity` `@ -560,12 +577,44 @@ anosim(t(otu_pro_perm), meta_stats\$salinity)` `# Calculate pairwise ANOSIM statistics` `# Download function from: https://raw.githubusercontent.com/chassenr/Tutorials/master/R_roundtable_sequence_analysis/anosimPosthoc.R` ``` ``` `source("C:/Users/chassenrueck/Documents/Repos/Tutorials/R_roundtable_sequence_analysis/anosimPosthoc.R")` `ANOSIMposthoc=function(M,E,distance="bray", padj="fdr"){` ` E=droplevels(E)` ` Mlist=list()` ` for(i in 1:length(levels(E))){` ` Mlist[[i]]=M[E==levels(E)[i],]` ` }` ` ` ` Elist=list()` ` for(i in 1:length(levels(E))){` ` Elist[[i]]=as.numeric(E[E==levels(E)[i]])` ` }` ` ` ` result=list(anosimR=matrix(NA,length(levels(E)),length(levels(E))),` ` anosimP=matrix(NA,length(levels(E)),length(levels(E))),` ` anosimPadj=matrix(NA,length(levels(E)),length(levels(E))))` ` colnames(result\$anosimR)=colnames(result\$anosimP)=levels(E)` ` rownames(result\$anosimR)=rownames(result\$anosimP)=levels(E)` ` for(i in 1:(length(levels(E))-1)){` ` for(j in (i+1):length(levels(E))){` ` temp=anosim(rbind(Mlist[[i]],Mlist[[j]]),c(Elist[[i]],Elist[[j]]),distance=distance)` ` result\$anosimR[j,i]=temp\$statistic` ` result\$anosimP[j,i]=temp\$signif` ` }` ` }` ` result\$anosimPadj=matrix(p.adjust(as.vector(result\$anosimP),method=padj,` ` n=length(which(!is.na(as.vector(result\$anosimP))))),` ` length(levels(E)),length(levels(E)))` ` colnames(result\$anosimPadj)=levels(E)` ` rownames(result\$anosimPadj)=levels(E)` ` return(result)` `}` ``` ``` `ANOSIMposthoc(t(otu_pro_perm), meta_stats\$salinity)` ``` ``` `# I recommend to focus on the ANOSIM R in the interpretation.` `# R < 0.3: weak separation` `# R < 0.3: weak to no separation` `# R > 0.7: strong separation` `# see also https://pubmed.ncbi.nlm.nih.gov/17892477/` ``` ``` `# QUESTION: What is the difference in the interpretation between PERMANOVA and ANOSIM?` `# Can you imagine cases, where there is disagreement between the result of these 2 methods?` `@ -579,12 +628,25 @@ ANOSIMposthoc(t(otu_pro_perm), meta_stats\$salinity)` ``` ``` `simper_pro <- simper(t(otu_pro_perm), meta_stats\$salinity)` ``` ``` `# Let's look at the contribution of the most important taxa according to simper` `barplot(simper_pro\$control_psu12\$cusum[1:100])` ``` ``` `# Which taxa contribute cumulatively to 50% of the observed differences between groups?` `simper_pro\$control_psu12\$species[simper_pro\$control_psu12\$ord[1:which(simper_pro\$control_psu12\$cusum >= 0.5)[1]]]` `tmp <- simper_pro\$control_psu12\$species[simper_pro\$control_psu12\$ord[1:which(simper_pro\$control_psu12\$cusum >= 0.5)[1]]]` ``` ``` `barplot(` ` otu_pro_perm[tmp, meta_stats\$salinity %in% c("control", "psu12")],` ` col = rainbow(length(tmp)),` ` las = 2,` ` cex.names = 0.5` `)` ``` ``` ``` ``` `### Redundancy Analysis ####` ``` ``` `# WARNING: Be sure that the relationships that you are investigating are actually linear,` `# when applying this method.` ``` ``` `# Inspect collinearity between predictor variables.` `# As RDA will assume linear combinations of the predictors, we will use pearson correlations.` `# As a rule of thumb, anything with an |r| > 0.8 will be highly problematic.` `@ -594,7 +656,7 @@ cor(meta_stats[, 7:11])` ``` ``` `# Our response variable (community matrix) is compositional.` `# We will apply a clr-transformation to coerce it into something resembling normality...` `# Since the otu count matrix is sparce, rare otus have to be exlcuded` `# Since the otu count matrix is sparce, rare otus have to be excluded` ``` ``` `otu_pro_sub <- otu_pro[rownames(otu_pro_perm), colnames(otu_pro_perm)]` `otu_pro_filt <- otu_pro_sub[apply(otu_pro_perm, 1, function(x) sum(x >= 0.1) >= length(x) * 0.1), ]` `@ -659,7 +721,7 @@ plot(rda_otu_varpart)` `### Differential abundance ####` ``` ``` `# Whether you are working with omics data or any other multivariate response,` `# usually the aim is not just to see if there is an overal change, but which variables ` `# usually the aim is not just to see if there is an overall change, but which variables ` `# (taxa) are responsible. ` ``` ``` `# For compositional sequence data, there is a package called ALDEx2, which accounts for compositionality` `@ -676,6 +738,10 @@ otu_pro_d21 <- otu_pro[, rownames(meta_d21)]` `otu_pro_d21_filt <- otu_pro_d21[apply(prop.table(otu_pro_d21, 2) * 100, 1, function(x) sum(x >= 0.1) >= 3), ]` `dim(otu_pro_d21_filt)` ``` ``` `# Removal of sequence` `# I would be careful witha filter that is removing too much of your data (i.e. more than 30-50%)` `summary(colSums(otu_pro_d21_filt)/colSums(otu_pro_d21))` ``` ``` `# Check that filtering did not change beta diversity patterns` `plot(` ` vegdist(t(prop.table(otu_pro_d21, 2) * 100)),` `@ -701,13 +767,19 @@ sum(aldex_out\$glm.eBH <= 0.05)` `summary(aldex_out\$kw.eBH)` `hist(aldex_out\$kw.eBH, breaks = 50)` `abline(v = 0.05)` `sum(aldex_out\$kw.eBH <= 0.05)` `sum(aldex_out\$kw.eBH <= 0.1)` ``` ``` `# How many ASVs detected as differentially enriched?` `# How many ASVs are detected as differentially enriched?` `# Due to the low sample size, it is possible that p-values are not reliable.` `# Especially for non-parametric tests, it may not be mathematically possible to obtain ` `# p-values low enough to not be above the significance threshold after correction.` ``` ``` `# I recommend to use the p-values obtained here, only as a means to find a starting point` `# for the interpretation, i.e. to choose some taxa (out of hundreds or thousands) to begin` `# describing patterns between experimental conditions. If those patterns do not make sense, i.e.` `# they are not supported by independent analyses or previous literature, I would use them only` `# to postulate new hypotheses and not to make final conclusions about an ecological phenomenon.` ``` ``` `# Compare parametric and non-parametric results` `plot(aldex_out\$kw.ep, aldex_out\$glm.eBH)` `abline(v = 0.05, h = 0.05)` `@ -729,7 +801,7 @@ dim(otu_pro_diff_filt)` `# This is actually the first time, that we would like to get more than just a sequence number for the taxa.` `tax_pro_diff_filt <- tax_pro[rownames(otu_pro_diff_filt), ]` ``` ``` `# While it is usually recommended to build a heatmap from sratch using basic plotting elements,` `# While it is usually recommended to build a heatmap from scratch using basic plotting elements,` `# we will use a shortcut here (provided by the gplots package).` `# For better visibility of small changes, show square-root transformed proportions.` `heatmap.2(` `@ -935,7 +1007,6 @@ View(tax_pro[cluster.pick.rep[[i]], ])` `# Correlation of eigengenes with environmental parameters` `cor_me_env <- cor(meta_stats[, 7:11], cluster.ME\$eigengenes)` ``` ``` `# There seems to be something going on between po4 and M2+4+10, no3 and M5, no2 and M6, nh4 and M9...` `# However, who says that the relationship should be expected to be linear?` `# Also, we did not check assumptions... ` `# In general, such an analysis may be more suitable for environmental samples...`