5 changed files with 363 additions and 0 deletions
@ -0,0 +1,309 @@
|
||||
################################# |
||||
# Analysis of 16S amplicon data # |
||||
################################# |
||||
|
||||
### prepare environment #### |
||||
|
||||
# set working directory |
||||
setwd("C:/Users/chassenrueck/Documents/IOW_home/Teaching/Practical_Jürgens_Kremp_2022/bioinf/example_data/") |
||||
|
||||
# load packages |
||||
# if not currently installed, |
||||
# click on packages (next to the plot tab in the bottom right pane) |
||||
# then, there is a install button |
||||
# or run: install.packages("vegan") |
||||
require(vegan) |
||||
require(iNEXT) |
||||
require(gplots) |
||||
require(dada2) # Hint: this package is available on bioconductor |
||||
|
||||
# loading and saving workspace |
||||
# if you saved the workspace from the R intro, you can continue working with this one here |
||||
load("practical.Rdata") |
||||
# save.image("practical.Rdata") |
||||
|
||||
|
||||
### per base quality profile #### |
||||
|
||||
# Using the function plotQualityProfile("file_name.fastq") |
||||
# show the per base quality of the sequencing data. |
||||
# What is your impression of the sequence quality? Is it sufficient? |
||||
|
||||
|
||||
### read data #### |
||||
|
||||
# ASV table |
||||
ASV <- read.table( |
||||
"for_students/asvtab.txt", |
||||
h = T, |
||||
sep = "\t", |
||||
row.names = 1 |
||||
) |
||||
|
||||
# taxonomy of microbial community |
||||
TAX <- read.table( |
||||
"for_students/taxtab.txt", |
||||
h = T, |
||||
sep = "\t", |
||||
row.names = 1, |
||||
comment.char = "", |
||||
quote = "" |
||||
) |
||||
|
||||
# sample metadata (experimental conditions) |
||||
META <- read.table( |
||||
"results_read_run_tsv.txt", |
||||
h = T, |
||||
sep = "\t" |
||||
) |
||||
|
||||
# check data types |
||||
str(ASV) |
||||
str(TAX) |
||||
str(META) |
||||
|
||||
# There are a lot of columns in META that we don't really need for this exercise. |
||||
# Let's only pick those which may be relevant later: |
||||
ena_select_columns <- c( |
||||
"collection_date", |
||||
"country", |
||||
"depth", |
||||
"description", |
||||
"environment_biome", |
||||
"environment_feature", |
||||
"environment_material", |
||||
"lat", |
||||
"library_source", |
||||
"location", |
||||
"lon", |
||||
"run_accession", |
||||
"sample_description", |
||||
"sample_title", |
||||
"target_gene" |
||||
) |
||||
META <- META[, ena_select_columns] |
||||
|
||||
# There are also 270 data sets in the metadata table, only 4 of which will be used here. |
||||
# Be aware that each sample title occurs twice, since taxonomic composition was assessed on both |
||||
# RNA and DNA level in the same sample. We need the DNA (library source = METAGENOMIC) data here. |
||||
META <- META[META$library_source == "METAGENOMIC" & META$sample_title %in% colnames(ASV),] |
||||
|
||||
# Now we can set the sample title as row names |
||||
rownames(META) <- META$sample_title |
||||
|
||||
# adjust object and data types |
||||
ASV <- as.matrix(ASV) |
||||
|
||||
# reorder data |
||||
# since the non-bloom stations are further west, we can reorder by longitude |
||||
META <- META[order(META$lon), ] |
||||
ASV <- ASV[, rownames(META)] |
||||
|
||||
# check that table are in the correct order |
||||
all.equal(rownames(ASV), rownames(TAX)) |
||||
all.equal(colnames(ASV), rownames(META)) |
||||
|
||||
# create color scheme according to bloom stage |
||||
META$color <- c("blue", "blue", "green", "green") |
||||
|
||||
# calculate proportions |
||||
ASV.rel <- prop.table(ASV, 2) * 100 |
||||
|
||||
# summarize sequence counts at each taxonomic level |
||||
# create an empty list |
||||
TAX.pooled <- vector(mode = "list", length = 6) |
||||
names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus |
||||
|
||||
# fill elements of list with data frames with sequence counts per taxon, |
||||
# one taxonomic level at a time |
||||
for (i in 1:6) { |
||||
temp <- aggregate( |
||||
ASV, |
||||
by = list(TAX[, i]), |
||||
FUN = sum |
||||
) |
||||
rownames(temp) <- temp$Group.1 |
||||
TAX.pooled[[i]] <- as.matrix(temp[, -1]) |
||||
rm(temp) |
||||
} |
||||
|
||||
# calculate percentages for each taxonomic level using lapply |
||||
TAX.pooled.rel <- lapply( |
||||
TAX.pooled, |
||||
function(x) { |
||||
prop.table(x, 2) * 100 |
||||
} |
||||
) |
||||
|
||||
|
||||
### rarefaction curves #### |
||||
|
||||
# we will use the iNEXT package to generate the input for the rarefaction curves |
||||
# for each sample, alpha diversity will be estimated at 250 different sequencing depths |
||||
# rarefaction curves will be drawn for Hill numbers 0, 1, 2 corresponding to: |
||||
# richness (0) |
||||
# exponential Shannon (1) |
||||
# inverse Simpson (2) |
||||
|
||||
# calculate diversity for each sample at each sequencing depth |
||||
inext_out <- lapply( |
||||
1:ncol(ASV), |
||||
function(x) { |
||||
iNEXT( |
||||
ASV[, x], |
||||
q = c(0, 1, 2), |
||||
datatype = "abundance", |
||||
endpoint = colSums(ASV)[x], |
||||
knots = 250, |
||||
se = F, |
||||
nboot = 5 |
||||
) |
||||
} |
||||
) |
||||
|
||||
# extract output in easier form for plotting |
||||
# for each hill number there will be 2 tables with x and y for the lines to plot |
||||
# for each sample (column) |
||||
inext_parsed <- lapply( |
||||
0:2, |
||||
function(y) { |
||||
list( |
||||
seq_depth = sapply(inext_out, function(x) { x$iNextEst$m[x$iNextEst$order == y] }), |
||||
alpha_div = sapply(inext_out, function(x) { x$iNextEst$qD[x$iNextEst$order == y] }) |
||||
) |
||||
} |
||||
) |
||||
|
||||
# create multi-panel plot |
||||
par(mfrow = c(3, 1), mar = c(1, 5, 1, 1), oma = c(4, 0, 0, 0)) |
||||
|
||||
# for each Hill number |
||||
for(i in 1:length(inext_parsed)) { |
||||
# create empty plot of correct dimensions |
||||
# already add y-axis label |
||||
plot( |
||||
0, 0, |
||||
type = "n", |
||||
xlim = c(0, max(inext_parsed[[i]]$seq_depth)), |
||||
ylim = c(0, max(inext_parsed[[i]]$alpha_div)), |
||||
xlab = "sequencing depth", |
||||
ylab = paste0("Alpha diversity (q = ", i - 1, ")"), |
||||
axes = F, |
||||
cex.lab = 1.3 |
||||
) |
||||
# draw box around plot |
||||
box("plot") |
||||
# add y-axis |
||||
axis(2, las = 1) |
||||
# add x-axis, but only label ticks in last plot |
||||
if(i == length(inext_parsed)) { |
||||
axis(1) |
||||
} else { |
||||
axis(1, labels = NA) |
||||
} |
||||
# for each sample, draw the rarefaction curve... |
||||
for(j in 1:ncol(inext_parsed[[i]]$seq_depth)) { |
||||
lines( |
||||
inext_parsed[[i]]$seq_depth[, j], |
||||
inext_parsed[[i]]$alpha_div[, j], |
||||
col = META$color[j] |
||||
) |
||||
# ... and add the sample name to the end of the curve |
||||
text( |
||||
max(inext_parsed[[i]]$seq_depth[, j]), |
||||
max(inext_parsed[[i]]$alpha_div[, j]), |
||||
col = META$color[j], |
||||
pos = 1, |
||||
labels = colnames(ASV)[j] |
||||
) |
||||
} |
||||
} |
||||
# add x-axis label |
||||
mtext(text = "Sequencing depth", side = 1, line = 2.5) |
||||
|
||||
# Are there differences in alpha diversity between samples? |
||||
# Which samples are more diverse and why? |
||||
# Did you observe similar trends in the incubation experiment (phytoplankton addition) based on SSCP? |
||||
|
||||
# What do these rarefaction curves tell you about how sufficient the sequencing was? |
||||
# Is it necessary to rarefy (subsample) the data set to compare alpha diversity between samples? |
||||
|
||||
|
||||
### Dissimilarity #### |
||||
|
||||
# Bray-Curtis dissimilarity |
||||
bc <- vegdist(t(ASV.rel), method = "bray") |
||||
|
||||
# Jaccard dissimilarity |
||||
jc <- vegdist(t(ASV.rel), method = "jaccard", binary = T) |
||||
|
||||
# How similar are the samples to each other? |
||||
# What is the difference in the interpretation between the 2 dissimilarity measures? |
||||
|
||||
|
||||
### taxonomic composition #### |
||||
|
||||
# Here we will take a shortcut to produce a stacked barplot, only showing the most dominant taxa |
||||
# The following function can be downloaded from: https://raw.githubusercontent.com/chassenr/NGS/master/Plotting/PlotAbund.R |
||||
source("C:/Users/chassenrueck/Documents/Repos/NGS/Plotting/PlotAbund.R") |
||||
|
||||
PlotAbund( |
||||
TAX.pooled.rel$family, |
||||
abund = 5, |
||||
method = "percentage", |
||||
open.window = T, |
||||
plot.ratio = c(3.5, 1), |
||||
sort.taxa = T |
||||
) |
||||
|
||||
# Which taxa dominate the community at bloom and non-bloom stations? |
||||
# What is known about their ecology? |
||||
|
||||
# How are the changes in the taxonomic composition related to the observed patterns in alpha diversity? |
||||
|
||||
# How many different taxonomic groups are found at each level? |
||||
sapply(TAX.pooled.rel, nrow) |
||||
|
||||
# How many ASVs are not closely related to known taxa |
||||
round(sum(grepl("_unclassified", TAX$genus))/nrow(TAX) * 100, 200) |
||||
|
||||
|
||||
### heatmap #### |
||||
|
||||
# select the 5 most dominant ASVs per sample |
||||
asv_select <- c( |
||||
"asv_16s_1", |
||||
"asv_16s_2", |
||||
"asv_16s_3", |
||||
"asv_16s_4", |
||||
"asv_16s_5", |
||||
"asv_16s_6", |
||||
"asv_16s_8", |
||||
"asv_16s_9", |
||||
"asv_16s_10", |
||||
"asv_16s_11" |
||||
) |
||||
|
||||
asv_abund <- ASV.rel[asv_select, ] |
||||
|
||||
# Quick-and-dirty heatmap |
||||
heatmap.2( |
||||
asv_abund, |
||||
col = colorRampPalette(c("grey95", "darkred"))(50), |
||||
labRow = paste(TAX[asv_select, "genus"], asv_select), |
||||
breaks = seq(0, max(asv_abund), length.out = 51), |
||||
margins = c(10, 18), |
||||
trace = "none", |
||||
density.info = "none", |
||||
dendrogram = "none", |
||||
keysize = 1.3, |
||||
key.title = "", |
||||
cexCol = 1.5, |
||||
cexRow = 1, |
||||
key.xlab = "Sequence proportion [%]" |
||||
) |
||||
|
||||
# Further questions: |
||||
# What are the species/strain level taxonomic affiliations of the closest relatives of these sequences? |
||||
# What is the degree of novelty of these sequences? |
||||
# Where have similar sequences been found before? |
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
@ -0,0 +1,53 @@
|
||||
# I'm a huge section heading |
||||
some text |
||||
## I'm a large section heading |
||||
some text |
||||
### I'm a medium section heading |
||||
some text |
||||
#### I'm a small section heading |
||||
some text |
||||
#### I'm a small section heading |
||||
some text |
||||
##### I do not differ in size anymore |
||||
some text |
||||
|
||||
# Iterations/Enumerations |
||||
* an item |
||||
* another item |
||||
|
||||
1. first item |
||||
2. second item |
||||
|
||||
# Paragraphs and line breaks |
||||
If you want to start a new paragraph... |
||||
|
||||
... leave an empty line. |
||||
|
||||
To force a line break (empty spaces are inserted after this) |
||||
add two empty spaces in the end of the line. |
||||
|
||||
# Coding |
||||
Let's use some `inline code`. |
||||
|
||||
Or do it blockwise: |
||||
``` |
||||
def(arg): |
||||
return arg+1 |
||||
``` |
||||
|
||||
# Tables |
||||
|
||||
| id | col1 | col2 | |
||||
|-----|------|------| |
||||
| 1 | hs | 123 | |
||||
| 2 | ts | 321 | |
||||
|
||||
# Images |
||||
 |
||||
|
||||
# Links |
||||
[I'm a web link to the git homepage](https://git-scm.com/) |
||||
|
||||
[I'm a link to another file](anotherfile.md) |
||||
|
||||
|
Loading…
Reference in new issue