1 changed files with 485 additions and 0 deletions
@ -0,0 +1,485 @@
|
||||
################################# |
||||
# Analysis of 16S amplicon data # |
||||
################################# |
||||
|
||||
### prepare environment #### |
||||
|
||||
# set working directory |
||||
setwd("C:/Users/chassenrueck/Documents/IOW_home/Teaching/Practical_Jürgens_Kremp_2023/Bioinformatik/") |
||||
|
||||
# load packages |
||||
# if not currently installed, |
||||
# click on packages (next to the plot tab in the bottom right pane) |
||||
# then, there is a install button |
||||
# or run: install.packages("vegan") |
||||
require(vegan) |
||||
require(iNEXT) |
||||
require(gplots) |
||||
require(dada2) # Hint: this package is available on bioconductor |
||||
|
||||
# loading and saving workspace |
||||
# if you saved the workspace from the R intro, you can continue working with this one here |
||||
load("practical_dada2.Rdata") |
||||
# save.image("practical_dada2.Rdata") |
||||
load("practical_visualization.Rdata") |
||||
# save.image("practical_visualization.Rdata") |
||||
|
||||
|
||||
### Prepare files #### |
||||
|
||||
# define sample names |
||||
sample.names <- c("S44_28_20", "S91_74_25", "S66_47_20", "S71_51_25") |
||||
|
||||
# load metadtaa into R |
||||
META <- read.table( |
||||
"results_read_run_tsv.txt", |
||||
h = T, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# There are also 270 data sets in the metadata table, only 4 of which will be used here. |
||||
# Be aware that each sample title occurs twice, since taxonomic composition was assessed on both |
||||
# RNA and DNA level in the same sample. We need the DNA (library source = METAGENOMIC) data here. |
||||
|
||||
# subset to the samples of interest |
||||
META <- META[META$library_source == "METAGENOMIC" & META$sample_title %in% sample.names,] |
||||
|
||||
# check the order of observations in META. Is it the same as in sample.names? |
||||
all.equal(sample.names, META$sample_title) |
||||
|
||||
# change order of observations in META to match sample.names |
||||
rownames(META) <- META$sample_title |
||||
META <- META[sample.names, ] |
||||
|
||||
# get download links for fastq files |
||||
strsplit(META$fastq_ftp, ";") |
||||
# download files and save them in working directory |
||||
|
||||
# specify path to input sequences |
||||
fnFs <- paste0(META$run_accession, "_1.fastq") |
||||
fnRs <- paste0(META$run_accession, "_2.fastq") |
||||
names(fnFs) <- sample.names |
||||
names(fnRs) <- sample.names |
||||
|
||||
# define location of quality trimmed sequences |
||||
filtFs <- file.path("Filtered", paste0(META$run_accession, "_filt_R1.fastq")) |
||||
filtRs <- file.path("Filtered", paste0(META$run_accession, "_filt_R2.fastq")) |
||||
names(filtFs) <- sample.names |
||||
names(filtRs) <- sample.names |
||||
|
||||
|
||||
### Quality trimming #### |
||||
|
||||
# show the per base quality of the sequencing data. |
||||
# What is your impression of the sequence quality? Is it sufficient? |
||||
plotQualityProfile(fnFs) |
||||
plotQualityProfile(fnRs) |
||||
|
||||
# At which base position would you cut-off the low quality end of the sequence? |
||||
# You should plan for an overlap of 30bp and the maximum expected fragment length is 430bp |
||||
|
||||
filt.out <- filterAndTrim( |
||||
fwd = fnFs, |
||||
filt = filtFs, |
||||
rev = fnRs, |
||||
filt.rev = filtRs, |
||||
truncLen = c(230, 230), |
||||
maxN = 0, |
||||
minQ = 2, |
||||
maxEE = c(2, 2), |
||||
truncQ = 0, |
||||
rm.phix = TRUE, |
||||
compress = F, |
||||
multithread = F |
||||
) |
||||
|
||||
# How do the quality profiles look now? |
||||
# How many sequences were discarded? |
||||
# Which setting are the best, |
||||
# i.e. best compromise between achieving high quality and loosing too many sequences? |
||||
|
||||
|
||||
### Denoising #### |
||||
|
||||
# Error learning |
||||
errF <- learnErrors(filtFs, multithread = F, randomize = TRUE, verbose = 1, MAX_CONSIST = 10) |
||||
errR <- learnErrors(filtRs, multithread = F, randomize = TRUE, verbose = 1, MAX_CONSIST = 10) |
||||
|
||||
# inspect error profiles |
||||
plotErrors(errF, nominalQ = TRUE) |
||||
plotErrors(errR, nominalQ = TRUE) |
||||
|
||||
# What is your assessment of how well error rates are approximated? |
||||
# Deviations in which directions are more detrimental? |
||||
|
||||
# apply denoising |
||||
dadaFs <- dada(filtFs, err = errF, multithread = F, pool = "pseudo") |
||||
dadaRs <- dada(filtRs, err = errR, multithread = F, pool = "pseudo") |
||||
|
||||
# What is the ratio between unique and total number of reads? |
||||
# What is the meaning of the parameter 'pool'? |
||||
# How will different settings (TRUE, FALSE, pseudo) affect your results and downstream processing steps? |
||||
|
||||
|
||||
### Merged paired-end reads #### |
||||
|
||||
# aligned forward and reverse reads and stitch together based on overlap region |
||||
mergers <- mergePairs( |
||||
dadaFs, |
||||
filtFs, |
||||
dadaRs, |
||||
filtRs, |
||||
minOverlap = 10, |
||||
verbose = TRUE |
||||
) |
||||
|
||||
# create sequence table |
||||
seqtab <- makeSequenceTable(mergers) |
||||
|
||||
# How many amplicon sequence variants (ASVs) were detected in the data set? |
||||
|
||||
|
||||
### Chimera detection #### |
||||
|
||||
# identify ASVs that are hybrids of different templates |
||||
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = F, verbose = TRUE) |
||||
|
||||
# How many ASVs did survive? |
||||
# Which fraction of the community is kept? |
||||
|
||||
# Sequence length distribution |
||||
table(rep(nchar(colnames(seqtab.nochim)), colSums(seqtab.nochim))) |
||||
|
||||
# Are there any sequences that you would discard? |
||||
seqtab.nochim2 <- seqtab.nochim[, nchar(colnames(seqtab.nochim)) %in% seq(402, 431) & colSums(seqtab.nochim) > 1] |
||||
|
||||
# Why am I excluding ASVs only occurring once in the data set? |
||||
|
||||
# How many sequences survive the analysis workflow until now? |
||||
# How much of the initial input data was removed? |
||||
|
||||
|
||||
### Taxonomic classification #### |
||||
|
||||
# download the taxonomic reference: |
||||
# https://zenodo.org/record/4587955/files/silva_nr99_v138.1_train_set.fa.gz |
||||
|
||||
tax <- assignTaxonomy( |
||||
seqtab.nochim2, |
||||
"silva_nr99_v138.1_train_set.fa.gz", |
||||
multithread = F, |
||||
minBoot = 70, |
||||
outputBootstraps = F, |
||||
taxLevels = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus") |
||||
) |
||||
tax <- data.frame(tax) |
||||
|
||||
# Remove unwanted lineages |
||||
tax.good <- tax[tax$Kingdom == "Bacteria" & !grepl("[Cc]hloroplast", tax$Order) & !grepl("[Mm]itochondria", tax$Family), ] |
||||
|
||||
# How many ASVs are kept? |
||||
# Which fraction of the community was removed? |
||||
|
||||
# How many ASV as unclassified at each taxonomic level? |
||||
apply(tax.good, 2, function(x) sum(is.na(x))) |
||||
|
||||
# Quick-and-dirty solution to NAs in tax table |
||||
tax.clean <- tax.good |
||||
for(i in 2:ncol(tax.good)) { |
||||
tax.clean[, i] <- ifelse( |
||||
is.na(tax.clean[, i]), |
||||
paste0(gsub("_unclassified", "", tax.clean[, i - 1]), "_unclassified"), |
||||
tax.clean[, i] |
||||
) |
||||
} |
||||
|
||||
|
||||
### Format tables and write output #### |
||||
|
||||
# subset ASV table to classified sequence |
||||
seqtab.clean <- seqtab.nochim2[, rownames(tax.clean)] |
||||
|
||||
# format sequence names |
||||
seqtab.clean.print <- seqtab.clean |
||||
colnames(seqtab.clean.print) <- paste("asv", 1:ncol(seqtab.clean), sep = "_") |
||||
|
||||
# write files |
||||
# for easier readability, transpose ASV table (not strictly necessary, just out of convenience) |
||||
write.table( |
||||
t(seqtab.clean.print), |
||||
"asvtab.txt", |
||||
quote = F, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# save sequences in separate fasta file |
||||
uniquesToFasta(seqtab.clean, "seqs.fasta", ids = colnames(seqtab.clean.print)) |
||||
|
||||
# write taxonomy table |
||||
tax.clean.print <- tax.clean |
||||
rownames(tax.clean.print) <- colnames(seqtab.clean.print) |
||||
write.table( |
||||
tax.clean.print, |
||||
"taxtab.txt", |
||||
quote = F, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# also save reduced set of metadata parameters |
||||
write.table( |
||||
META[, c("lat", "lon", "sample_description")], |
||||
"metadata.txt", |
||||
quote = F, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# Don't forget to save your workspace |
||||
# Then remove all items from workspace and start fresh environment |
||||
|
||||
|
||||
### Read results of dada2 pipeline #### |
||||
|
||||
# ASV table |
||||
ASV <- read.table( |
||||
"asvtab.txt", |
||||
h = T, |
||||
sep = "\t", |
||||
row.names = 1 |
||||
) |
||||
|
||||
# taxonomy of microbial community |
||||
TAX <- read.table( |
||||
"taxtab.txt", |
||||
h = T, |
||||
sep = "\t", |
||||
row.names = 1, |
||||
comment.char = "", |
||||
quote = "" |
||||
) |
||||
|
||||
# sample metadata (experimental conditions) |
||||
META <- read.table( |
||||
"metadata.txt", |
||||
h = T, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# check data types |
||||
str(ASV) |
||||
str(TAX) |
||||
str(META) |
||||
|
||||
# make sure that objects are in the correct orientation/order |
||||
all.equal(rownames(META), colnames(ASV)) |
||||
all.equal(rownames(ASV), rownames(TAX)) |
||||
|
||||
# since the non-bloom stations are further west, we can use longitude to define colors according to bloom stage |
||||
META$color <- ifelse( |
||||
META$lon < -40, |
||||
"blue", |
||||
"green" |
||||
) |
||||
|
||||
# calculate proportions |
||||
ASV.rel <- prop.table(as.matrix(ASV), 2) * 100 |
||||
|
||||
# Why can't we use sequence counts directly for the further analysis? |
||||
|
||||
# summarize sequence counts at each taxonomic level |
||||
# create an empty list |
||||
TAX.pooled <- vector(mode = "list", length = 6) |
||||
names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus |
||||
|
||||
# fill elements of list with data frames with sequence counts per taxon, |
||||
# one taxonomic level at a time |
||||
for (i in 1:6) { |
||||
temp <- aggregate( |
||||
ASV, |
||||
by = list(TAX[, i]), |
||||
FUN = sum |
||||
) |
||||
rownames(temp) <- temp$Group.1 |
||||
TAX.pooled[[i]] <- as.matrix(temp[, -1]) |
||||
rm(temp) |
||||
} |
||||
|
||||
# calculate percentages for each taxonomic level using lapply |
||||
TAX.pooled.rel <- lapply( |
||||
TAX.pooled, |
||||
function(x) { |
||||
prop.table(x, 2) * 100 |
||||
} |
||||
) |
||||
View(TAX.pooled.rel$Phylum) |
||||
|
||||
|
||||
### rarefaction curves #### |
||||
|
||||
# we will use the iNEXT package to generate the input for the rarefaction curves |
||||
# for each sample, alpha diversity will be estimated at 250 different sequencing depths |
||||
# rarefaction curves will be drawn for Hill numbers 0, 1, 2 corresponding to: |
||||
# richness (0) |
||||
# exponential Shannon (1) |
||||
# inverse Simpson (2) |
||||
|
||||
# calculate diversity for each sample at each sequencing depth |
||||
inext_out <- lapply( |
||||
1:ncol(ASV), |
||||
function(x) { |
||||
iNEXT( |
||||
ASV[, x], |
||||
q = c(0, 1, 2), |
||||
datatype = "abundance", |
||||
endpoint = colSums(ASV)[x], |
||||
knots = 250, |
||||
se = F, |
||||
nboot = 5 |
||||
) |
||||
} |
||||
) |
||||
|
||||
# extract output in easier form for plotting |
||||
# for each hill number there will be 2 tables with x and y for the lines to plot |
||||
# for each sample (column) |
||||
inext_parsed <- lapply( |
||||
0:2, |
||||
function(y) { |
||||
list( |
||||
seq_depth = sapply(inext_out, function(x) { x$iNextEst$m[x$iNextEst$order == y] }), |
||||
alpha_div = sapply(inext_out, function(x) { x$iNextEst$qD[x$iNextEst$order == y] }) |
||||
) |
||||
} |
||||
) |
||||
|
||||
# create multi-panel plot |
||||
par(mfrow = c(3, 1), mar = c(1, 5, 1, 1), oma = c(4, 0, 0, 0)) |
||||
|
||||
# for each Hill number |
||||
for(i in 1:length(inext_parsed)) { |
||||
# create empty plot of correct dimensions |
||||
# already add y-axis label |
||||
plot( |
||||
0, 0, |
||||
type = "n", |
||||
xlim = c(0, max(inext_parsed[[i]]$seq_depth)), |
||||
ylim = c(0, max(inext_parsed[[i]]$alpha_div)), |
||||
xlab = "sequencing depth", |
||||
ylab = paste0("Alpha diversity (q = ", i - 1, ")"), |
||||
axes = F, |
||||
cex.lab = 1.3 |
||||
) |
||||
# draw box around plot |
||||
box("plot") |
||||
# add y-axis |
||||
axis(2, las = 1) |
||||
# add x-axis, but only label ticks in last plot |
||||
if(i == length(inext_parsed)) { |
||||
axis(1) |
||||
} else { |
||||
axis(1, labels = NA) |
||||
} |
||||
# for each sample, draw the rarefaction curve... |
||||
for(j in 1:ncol(inext_parsed[[i]]$seq_depth)) { |
||||
lines( |
||||
inext_parsed[[i]]$seq_depth[, j], |
||||
inext_parsed[[i]]$alpha_div[, j], |
||||
col = META$color[j] |
||||
) |
||||
# ... and add the sample name to the end of the curve |
||||
text( |
||||
max(inext_parsed[[i]]$seq_depth[, j]), |
||||
max(inext_parsed[[i]]$alpha_div[, j]), |
||||
col = META$color[j], |
||||
pos = 1, |
||||
labels = colnames(ASV)[j] |
||||
) |
||||
} |
||||
} |
||||
# add x-axis label |
||||
mtext(text = "Sequencing depth", side = 1, line = 2.5) |
||||
|
||||
# Are there differences in alpha diversity between samples? |
||||
# Which samples are more diverse and why? |
||||
# Did you observe similar trends in the incubation experiment (phytoplankton addition) based on SSCP? |
||||
|
||||
# What do these rarefaction curves tell you about how sufficient the sequencing was? |
||||
# Is it necessary to rarefy (subsample) the data set to compare alpha diversity between samples? |
||||
|
||||
|
||||
### Dissimilarity #### |
||||
|
||||
# Bray-Curtis dissimilarity |
||||
bc <- vegdist(t(ASV.rel), method = "bray") |
||||
bc |
||||
# Jaccard dissimilarity |
||||
jc <- vegdist(t(ASV.rel), method = "jaccard", binary = T) |
||||
jc |
||||
# How similar are the samples to each other? |
||||
# What is the difference in the interpretation between the 2 dissimilarity measures? |
||||
|
||||
|
||||
### taxonomic composition #### |
||||
|
||||
# Here we will take a shortcut to produce a stacked barplot, only showing the most dominant taxa |
||||
# The following function can be downloaded from: https://raw.githubusercontent.com/chassenr/NGS/master/Plotting/PlotAbund.R |
||||
# source("C:/Users/chassenrueck/Documents/Repos/NGS/Plotting/PlotAbund.R") |
||||
|
||||
PlotAbund( |
||||
TAX.pooled.rel$Genus, |
||||
abund = 5, |
||||
method = "percentage", |
||||
open.window = T, |
||||
plot.ratio = c(3.5, 1), |
||||
sort.taxa = T |
||||
) |
||||
|
||||
# Which taxa dominate the community at bloom and non-bloom stations? |
||||
# What is known about their ecology? |
||||
|
||||
# How are the changes in the taxonomic composition related to the observed patterns in alpha diversity? |
||||
|
||||
# How many different taxonomic groups are found at each level? |
||||
sapply(TAX.pooled.rel, nrow) |
||||
|
||||
# How many ASVs are not closely related to known taxa |
||||
round(sum(grepl("_unclassified", TAX$Genus))/nrow(TAX) * 100, 2) |
||||
|
||||
|
||||
### heatmap #### |
||||
|
||||
# select the 5 most dominant ASVs per sample |
||||
asv_select <- c( |
||||
"asv_1", |
||||
"asv_2", |
||||
"asv_3", |
||||
"asv_4", |
||||
"asv_5", |
||||
"asv_6", |
||||
"asv_7", |
||||
"asv_9", |
||||
"asv_10" |
||||
) |
||||
|
||||
asv_abund <- ASV.rel[asv_select, ] |
||||
|
||||
# Quick-and-dirty heatmap |
||||
heatmap.2( |
||||
asv_abund, |
||||
col = colorRampPalette(c("grey95", "darkred"))(50), |
||||
labRow = paste(TAX[asv_select, "genus"], asv_select), |
||||
breaks = seq(0, max(asv_abund), length.out = 51), |
||||
margins = c(10, 18), |
||||
trace = "none", |
||||
density.info = "none", |
||||
dendrogram = "none", |
||||
keysize = 1.3, |
||||
key.title = "", |
||||
cexCol = 1.5, |
||||
cexRow = 1, |
||||
key.xlab = "Sequence proportion [%]" |
||||
) |
||||
|
||||
# Further questions: |
||||
# What are the species/strain level taxonomic affiliations of the closest relatives of these sequences? |
||||
# What is the degree of novelty of these sequences? |
||||
# Where have similar sequences been found before? |
Loading…
Reference in new issue