Browse Source

added R script for microbial ecology practical 2023

master
chassenr 4 months ago
parent
commit
dafd44b578
  1. 485
      Microbio_practical_2023/R_script_practical.R

485
Microbio_practical_2023/R_script_practical.R

@ -0,0 +1,485 @@
#################################
# Analysis of 16S amplicon data #
#################################
### prepare environment ####
# set working directory
setwd("C:/Users/chassenrueck/Documents/IOW_home/Teaching/Practical_Jürgens_Kremp_2023/Bioinformatik/")
# load packages
# if not currently installed,
# click on packages (next to the plot tab in the bottom right pane)
# then, there is a install button
# or run: install.packages("vegan")
require(vegan)
require(iNEXT)
require(gplots)
require(dada2) # Hint: this package is available on bioconductor
# loading and saving workspace
# if you saved the workspace from the R intro, you can continue working with this one here
load("practical_dada2.Rdata")
# save.image("practical_dada2.Rdata")
load("practical_visualization.Rdata")
# save.image("practical_visualization.Rdata")
### Prepare files ####
# define sample names
sample.names <- c("S44_28_20", "S91_74_25", "S66_47_20", "S71_51_25")
# load metadtaa into R
META <- read.table(
"results_read_run_tsv.txt",
h = T,
sep = "\t"
)
# There are also 270 data sets in the metadata table, only 4 of which will be used here.
# Be aware that each sample title occurs twice, since taxonomic composition was assessed on both
# RNA and DNA level in the same sample. We need the DNA (library source = METAGENOMIC) data here.
# subset to the samples of interest
META <- META[META$library_source == "METAGENOMIC" & META$sample_title %in% sample.names,]
# check the order of observations in META. Is it the same as in sample.names?
all.equal(sample.names, META$sample_title)
# change order of observations in META to match sample.names
rownames(META) <- META$sample_title
META <- META[sample.names, ]
# get download links for fastq files
strsplit(META$fastq_ftp, ";")
# download files and save them in working directory
# specify path to input sequences
fnFs <- paste0(META$run_accession, "_1.fastq")
fnRs <- paste0(META$run_accession, "_2.fastq")
names(fnFs) <- sample.names
names(fnRs) <- sample.names
# define location of quality trimmed sequences
filtFs <- file.path("Filtered", paste0(META$run_accession, "_filt_R1.fastq"))
filtRs <- file.path("Filtered", paste0(META$run_accession, "_filt_R2.fastq"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
### Quality trimming ####
# show the per base quality of the sequencing data.
# What is your impression of the sequence quality? Is it sufficient?
plotQualityProfile(fnFs)
plotQualityProfile(fnRs)
# At which base position would you cut-off the low quality end of the sequence?
# You should plan for an overlap of 30bp and the maximum expected fragment length is 430bp
filt.out <- filterAndTrim(
fwd = fnFs,
filt = filtFs,
rev = fnRs,
filt.rev = filtRs,
truncLen = c(230, 230),
maxN = 0,
minQ = 2,
maxEE = c(2, 2),
truncQ = 0,
rm.phix = TRUE,
compress = F,
multithread = F
)
# How do the quality profiles look now?
# How many sequences were discarded?
# Which setting are the best,
# i.e. best compromise between achieving high quality and loosing too many sequences?
### Denoising ####
# Error learning
errF <- learnErrors(filtFs, multithread = F, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR <- learnErrors(filtRs, multithread = F, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
# inspect error profiles
plotErrors(errF, nominalQ = TRUE)
plotErrors(errR, nominalQ = TRUE)
# What is your assessment of how well error rates are approximated?
# Deviations in which directions are more detrimental?
# apply denoising
dadaFs <- dada(filtFs, err = errF, multithread = F, pool = "pseudo")
dadaRs <- dada(filtRs, err = errR, multithread = F, pool = "pseudo")
# What is the ratio between unique and total number of reads?
# What is the meaning of the parameter 'pool'?
# How will different settings (TRUE, FALSE, pseudo) affect your results and downstream processing steps?
### Merged paired-end reads ####
# aligned forward and reverse reads and stitch together based on overlap region
mergers <- mergePairs(
dadaFs,
filtFs,
dadaRs,
filtRs,
minOverlap = 10,
verbose = TRUE
)
# create sequence table
seqtab <- makeSequenceTable(mergers)
# How many amplicon sequence variants (ASVs) were detected in the data set?
### Chimera detection ####
# identify ASVs that are hybrids of different templates
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = F, verbose = TRUE)
# How many ASVs did survive?
# Which fraction of the community is kept?
# Sequence length distribution
table(rep(nchar(colnames(seqtab.nochim)), colSums(seqtab.nochim)))
# Are there any sequences that you would discard?
seqtab.nochim2 <- seqtab.nochim[, nchar(colnames(seqtab.nochim)) %in% seq(402, 431) & colSums(seqtab.nochim) > 1]
# Why am I excluding ASVs only occurring once in the data set?
# How many sequences survive the analysis workflow until now?
# How much of the initial input data was removed?
### Taxonomic classification ####
# download the taxonomic reference:
# https://zenodo.org/record/4587955/files/silva_nr99_v138.1_train_set.fa.gz
tax <- assignTaxonomy(
seqtab.nochim2,
"silva_nr99_v138.1_train_set.fa.gz",
multithread = F,
minBoot = 70,
outputBootstraps = F,
taxLevels = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
)
tax <- data.frame(tax)
# Remove unwanted lineages
tax.good <- tax[tax$Kingdom == "Bacteria" & !grepl("[Cc]hloroplast", tax$Order) & !grepl("[Mm]itochondria", tax$Family), ]
# How many ASVs are kept?
# Which fraction of the community was removed?
# How many ASV as unclassified at each taxonomic level?
apply(tax.good, 2, function(x) sum(is.na(x)))
# Quick-and-dirty solution to NAs in tax table
tax.clean <- tax.good
for(i in 2:ncol(tax.good)) {
tax.clean[, i] <- ifelse(
is.na(tax.clean[, i]),
paste0(gsub("_unclassified", "", tax.clean[, i - 1]), "_unclassified"),
tax.clean[, i]
)
}
### Format tables and write output ####
# subset ASV table to classified sequence
seqtab.clean <- seqtab.nochim2[, rownames(tax.clean)]
# format sequence names
seqtab.clean.print <- seqtab.clean
colnames(seqtab.clean.print) <- paste("asv", 1:ncol(seqtab.clean), sep = "_")
# write files
# for easier readability, transpose ASV table (not strictly necessary, just out of convenience)
write.table(
t(seqtab.clean.print),
"asvtab.txt",
quote = F,
sep = "\t"
)
# save sequences in separate fasta file
uniquesToFasta(seqtab.clean, "seqs.fasta", ids = colnames(seqtab.clean.print))
# write taxonomy table
tax.clean.print <- tax.clean
rownames(tax.clean.print) <- colnames(seqtab.clean.print)
write.table(
tax.clean.print,
"taxtab.txt",
quote = F,
sep = "\t"
)
# also save reduced set of metadata parameters
write.table(
META[, c("lat", "lon", "sample_description")],
"metadata.txt",
quote = F,
sep = "\t"
)
# Don't forget to save your workspace
# Then remove all items from workspace and start fresh environment
### Read results of dada2 pipeline ####
# ASV table
ASV <- read.table(
"asvtab.txt",
h = T,
sep = "\t",
row.names = 1
)
# taxonomy of microbial community
TAX <- read.table(
"taxtab.txt",
h = T,
sep = "\t",
row.names = 1,
comment.char = "",
quote = ""
)
# sample metadata (experimental conditions)
META <- read.table(
"metadata.txt",
h = T,
sep = "\t"
)
# check data types
str(ASV)
str(TAX)
str(META)
# make sure that objects are in the correct orientation/order
all.equal(rownames(META), colnames(ASV))
all.equal(rownames(ASV), rownames(TAX))
# since the non-bloom stations are further west, we can use longitude to define colors according to bloom stage
META$color <- ifelse(
META$lon < -40,
"blue",
"green"
)
# calculate proportions
ASV.rel <- prop.table(as.matrix(ASV), 2) * 100
# Why can't we use sequence counts directly for the further analysis?
# summarize sequence counts at each taxonomic level
# create an empty list
TAX.pooled <- vector(mode = "list", length = 6)
names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus
# fill elements of list with data frames with sequence counts per taxon,
# one taxonomic level at a time
for (i in 1:6) {
temp <- aggregate(
ASV,
by = list(TAX[, i]),
FUN = sum
)
rownames(temp) <- temp$Group.1
TAX.pooled[[i]] <- as.matrix(temp[, -1])
rm(temp)
}
# calculate percentages for each taxonomic level using lapply
TAX.pooled.rel <- lapply(
TAX.pooled,
function(x) {
prop.table(x, 2) * 100
}
)
View(TAX.pooled.rel$Phylum)
### rarefaction curves ####
# we will use the iNEXT package to generate the input for the rarefaction curves
# for each sample, alpha diversity will be estimated at 250 different sequencing depths
# rarefaction curves will be drawn for Hill numbers 0, 1, 2 corresponding to:
# richness (0)
# exponential Shannon (1)
# inverse Simpson (2)
# calculate diversity for each sample at each sequencing depth
inext_out <- lapply(
1:ncol(ASV),
function(x) {
iNEXT(
ASV[, x],
q = c(0, 1, 2),
datatype = "abundance",
endpoint = colSums(ASV)[x],
knots = 250,
se = F,
nboot = 5
)
}
)
# extract output in easier form for plotting
# for each hill number there will be 2 tables with x and y for the lines to plot
# for each sample (column)
inext_parsed <- lapply(
0:2,
function(y) {
list(
seq_depth = sapply(inext_out, function(x) { x$iNextEst$m[x$iNextEst$order == y] }),
alpha_div = sapply(inext_out, function(x) { x$iNextEst$qD[x$iNextEst$order == y] })
)
}
)
# create multi-panel plot
par(mfrow = c(3, 1), mar = c(1, 5, 1, 1), oma = c(4, 0, 0, 0))
# for each Hill number
for(i in 1:length(inext_parsed)) {
# create empty plot of correct dimensions
# already add y-axis label
plot(
0, 0,
type = "n",
xlim = c(0, max(inext_parsed[[i]]$seq_depth)),
ylim = c(0, max(inext_parsed[[i]]$alpha_div)),
xlab = "sequencing depth",
ylab = paste0("Alpha diversity (q = ", i - 1, ")"),
axes = F,
cex.lab = 1.3
)
# draw box around plot
box("plot")
# add y-axis
axis(2, las = 1)
# add x-axis, but only label ticks in last plot
if(i == length(inext_parsed)) {
axis(1)
} else {
axis(1, labels = NA)
}
# for each sample, draw the rarefaction curve...
for(j in 1:ncol(inext_parsed[[i]]$seq_depth)) {
lines(
inext_parsed[[i]]$seq_depth[, j],
inext_parsed[[i]]$alpha_div[, j],
col = META$color[j]
)
# ... and add the sample name to the end of the curve
text(
max(inext_parsed[[i]]$seq_depth[, j]),
max(inext_parsed[[i]]$alpha_div[, j]),
col = META$color[j],
pos = 1,
labels = colnames(ASV)[j]
)
}
}
# add x-axis label
mtext(text = "Sequencing depth", side = 1, line = 2.5)
# Are there differences in alpha diversity between samples?
# Which samples are more diverse and why?
# Did you observe similar trends in the incubation experiment (phytoplankton addition) based on SSCP?
# What do these rarefaction curves tell you about how sufficient the sequencing was?
# Is it necessary to rarefy (subsample) the data set to compare alpha diversity between samples?
### Dissimilarity ####
# Bray-Curtis dissimilarity
bc <- vegdist(t(ASV.rel), method = "bray")
bc
# Jaccard dissimilarity
jc <- vegdist(t(ASV.rel), method = "jaccard", binary = T)
jc
# How similar are the samples to each other?
# What is the difference in the interpretation between the 2 dissimilarity measures?
### taxonomic composition ####
# Here we will take a shortcut to produce a stacked barplot, only showing the most dominant taxa
# The following function can be downloaded from: https://raw.githubusercontent.com/chassenr/NGS/master/Plotting/PlotAbund.R
# source("C:/Users/chassenrueck/Documents/Repos/NGS/Plotting/PlotAbund.R")
PlotAbund(
TAX.pooled.rel$Genus,
abund = 5,
method = "percentage",
open.window = T,
plot.ratio = c(3.5, 1),
sort.taxa = T
)
# Which taxa dominate the community at bloom and non-bloom stations?
# What is known about their ecology?
# How are the changes in the taxonomic composition related to the observed patterns in alpha diversity?
# How many different taxonomic groups are found at each level?
sapply(TAX.pooled.rel, nrow)
# How many ASVs are not closely related to known taxa
round(sum(grepl("_unclassified", TAX$Genus))/nrow(TAX) * 100, 2)
### heatmap ####
# select the 5 most dominant ASVs per sample
asv_select <- c(
"asv_1",
"asv_2",
"asv_3",
"asv_4",
"asv_5",
"asv_6",
"asv_7",
"asv_9",
"asv_10"
)
asv_abund <- ASV.rel[asv_select, ]
# Quick-and-dirty heatmap
heatmap.2(
asv_abund,
col = colorRampPalette(c("grey95", "darkred"))(50),
labRow = paste(TAX[asv_select, "genus"], asv_select),
breaks = seq(0, max(asv_abund), length.out = 51),
margins = c(10, 18),
trace = "none",
density.info = "none",
dendrogram = "none",
keysize = 1.3,
key.title = "",
cexCol = 1.5,
cexRow = 1,
key.xlab = "Sequence proportion [%]"
)
# Further questions:
# What are the species/strain level taxonomic affiliations of the closest relatives of these sequences?
# What is the degree of novelty of these sequences?
# Where have similar sequences been found before?
Loading…
Cancel
Save