Browse Source

added git intro and script for microbio practical 2022

master
chassenr 11 months ago
parent
commit
6f8636df27
  1. 309
      Microbio_practical_2022/R_script_practical.R
  2. BIN
      git_intro_2022/22_05_14_iow.pdf
  3. 1
      git_intro_2022/markdown/anotherfile.md
  4. BIN
      git_intro_2022/markdown/img.png
  5. 53
      git_intro_2022/markdown/markdown.md

309
Microbio_practical_2022/R_script_practical.R

@ -0,0 +1,309 @@
#################################
# Analysis of 16S amplicon data #
#################################
### prepare environment ####
# set working directory
setwd("C:/Users/chassenrueck/Documents/IOW_home/Teaching/Practical_Jürgens_Kremp_2022/bioinf/example_data/")
# load packages
# if not currently installed,
# click on packages (next to the plot tab in the bottom right pane)
# then, there is a install button
# or run: install.packages("vegan")
require(vegan)
require(iNEXT)
require(gplots)
require(dada2) # Hint: this package is available on bioconductor
# loading and saving workspace
# if you saved the workspace from the R intro, you can continue working with this one here
load("practical.Rdata")
# save.image("practical.Rdata")
### per base quality profile ####
# Using the function plotQualityProfile("file_name.fastq")
# show the per base quality of the sequencing data.
# What is your impression of the sequence quality? Is it sufficient?
### read data ####
# ASV table
ASV <- read.table(
"for_students/asvtab.txt",
h = T,
sep = "\t",
row.names = 1
)
# taxonomy of microbial community
TAX <- read.table(
"for_students/taxtab.txt",
h = T,
sep = "\t",
row.names = 1,
comment.char = "",
quote = ""
)
# sample metadata (experimental conditions)
META <- read.table(
"results_read_run_tsv.txt",
h = T,
sep = "\t"
)
# check data types
str(ASV)
str(TAX)
str(META)
# There are a lot of columns in META that we don't really need for this exercise.
# Let's only pick those which may be relevant later:
ena_select_columns <- c(
"collection_date",
"country",
"depth",
"description",
"environment_biome",
"environment_feature",
"environment_material",
"lat",
"library_source",
"location",
"lon",
"run_accession",
"sample_description",
"sample_title",
"target_gene"
)
META <- META[, ena_select_columns]
# There are also 270 data sets in the metadata table, only 4 of which will be used here.
# Be aware that each sample title occurs twice, since taxonomic composition was assessed on both
# RNA and DNA level in the same sample. We need the DNA (library source = METAGENOMIC) data here.
META <- META[META$library_source == "METAGENOMIC" & META$sample_title %in% colnames(ASV),]
# Now we can set the sample title as row names
rownames(META) <- META$sample_title
# adjust object and data types
ASV <- as.matrix(ASV)
# reorder data
# since the non-bloom stations are further west, we can reorder by longitude
META <- META[order(META$lon), ]
ASV <- ASV[, rownames(META)]
# check that table are in the correct order
all.equal(rownames(ASV), rownames(TAX))
all.equal(colnames(ASV), rownames(META))
# create color scheme according to bloom stage
META$color <- c("blue", "blue", "green", "green")
# calculate proportions
ASV.rel <- prop.table(ASV, 2) * 100
# summarize sequence counts at each taxonomic level
# create an empty list
TAX.pooled <- vector(mode = "list", length = 6)
names(TAX.pooled) <- colnames(TAX)[1:6] # domain to genus
# fill elements of list with data frames with sequence counts per taxon,
# one taxonomic level at a time
for (i in 1:6) {
temp <- aggregate(
ASV,
by = list(TAX[, i]),
FUN = sum
)
rownames(temp) <- temp$Group.1
TAX.pooled[[i]] <- as.matrix(temp[, -1])
rm(temp)
}
# calculate percentages for each taxonomic level using lapply
TAX.pooled.rel <- lapply(
TAX.pooled,
function(x) {
prop.table(x, 2) * 100
}
)
### rarefaction curves ####
# we will use the iNEXT package to generate the input for the rarefaction curves
# for each sample, alpha diversity will be estimated at 250 different sequencing depths
# rarefaction curves will be drawn for Hill numbers 0, 1, 2 corresponding to:
# richness (0)
# exponential Shannon (1)
# inverse Simpson (2)
# calculate diversity for each sample at each sequencing depth
inext_out <- lapply(
1:ncol(ASV),
function(x) {
iNEXT(
ASV[, x],
q = c(0, 1, 2),
datatype = "abundance",
endpoint = colSums(ASV)[x],
knots = 250,
se = F,
nboot = 5
)
}
)
# extract output in easier form for plotting
# for each hill number there will be 2 tables with x and y for the lines to plot
# for each sample (column)
inext_parsed <- lapply(
0:2,
function(y) {
list(
seq_depth = sapply(inext_out, function(x) { x$iNextEst$m[x$iNextEst$order == y] }),
alpha_div = sapply(inext_out, function(x) { x$iNextEst$qD[x$iNextEst$order == y] })
)
}
)
# create multi-panel plot
par(mfrow = c(3, 1), mar = c(1, 5, 1, 1), oma = c(4, 0, 0, 0))
# for each Hill number
for(i in 1:length(inext_parsed)) {
# create empty plot of correct dimensions
# already add y-axis label
plot(
0, 0,
type = "n",
xlim = c(0, max(inext_parsed[[i]]$seq_depth)),
ylim = c(0, max(inext_parsed[[i]]$alpha_div)),
xlab = "sequencing depth",
ylab = paste0("Alpha diversity (q = ", i - 1, ")"),
axes = F,
cex.lab = 1.3
)
# draw box around plot
box("plot")
# add y-axis
axis(2, las = 1)
# add x-axis, but only label ticks in last plot
if(i == length(inext_parsed)) {
axis(1)
} else {
axis(1, labels = NA)
}
# for each sample, draw the rarefaction curve...
for(j in 1:ncol(inext_parsed[[i]]$seq_depth)) {
lines(
inext_parsed[[i]]$seq_depth[, j],
inext_parsed[[i]]$alpha_div[, j],
col = META$color[j]
)
# ... and add the sample name to the end of the curve
text(
max(inext_parsed[[i]]$seq_depth[, j]),
max(inext_parsed[[i]]$alpha_div[, j]),
col = META$color[j],
pos = 1,
labels = colnames(ASV)[j]
)
}
}
# add x-axis label
mtext(text = "Sequencing depth", side = 1, line = 2.5)
# Are there differences in alpha diversity between samples?
# Which samples are more diverse and why?
# Did you observe similar trends in the incubation experiment (phytoplankton addition) based on SSCP?
# What do these rarefaction curves tell you about how sufficient the sequencing was?
# Is it necessary to rarefy (subsample) the data set to compare alpha diversity between samples?
### Dissimilarity ####
# Bray-Curtis dissimilarity
bc <- vegdist(t(ASV.rel), method = "bray")
# Jaccard dissimilarity
jc <- vegdist(t(ASV.rel), method = "jaccard", binary = T)
# How similar are the samples to each other?
# What is the difference in the interpretation between the 2 dissimilarity measures?
### taxonomic composition ####
# Here we will take a shortcut to produce a stacked barplot, only showing the most dominant taxa
# The following function can be downloaded from: https://raw.githubusercontent.com/chassenr/NGS/master/Plotting/PlotAbund.R
source("C:/Users/chassenrueck/Documents/Repos/NGS/Plotting/PlotAbund.R")
PlotAbund(
TAX.pooled.rel$family,
abund = 5,
method = "percentage",
open.window = T,
plot.ratio = c(3.5, 1),
sort.taxa = T
)
# Which taxa dominate the community at bloom and non-bloom stations?
# What is known about their ecology?
# How are the changes in the taxonomic composition related to the observed patterns in alpha diversity?
# How many different taxonomic groups are found at each level?
sapply(TAX.pooled.rel, nrow)
# How many ASVs are not closely related to known taxa
round(sum(grepl("_unclassified", TAX$genus))/nrow(TAX) * 100, 200)
### heatmap ####
# select the 5 most dominant ASVs per sample
asv_select <- c(
"asv_16s_1",
"asv_16s_2",
"asv_16s_3",
"asv_16s_4",
"asv_16s_5",
"asv_16s_6",
"asv_16s_8",
"asv_16s_9",
"asv_16s_10",
"asv_16s_11"
)
asv_abund <- ASV.rel[asv_select, ]
# Quick-and-dirty heatmap
heatmap.2(
asv_abund,
col = colorRampPalette(c("grey95", "darkred"))(50),
labRow = paste(TAX[asv_select, "genus"], asv_select),
breaks = seq(0, max(asv_abund), length.out = 51),
margins = c(10, 18),
trace = "none",
density.info = "none",
dendrogram = "none",
keysize = 1.3,
key.title = "",
cexCol = 1.5,
cexRow = 1,
key.xlab = "Sequence proportion [%]"
)
# Further questions:
# What are the species/strain level taxonomic affiliations of the closest relatives of these sequences?
# What is the degree of novelty of these sequences?
# Where have similar sequences been found before?

BIN
git_intro_2022/22_05_14_iow.pdf

Binary file not shown.

1
git_intro_2022/markdown/anotherfile.md

@ -0,0 +1 @@
# I'm another file

BIN
git_intro_2022/markdown/img.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

53
git_intro_2022/markdown/markdown.md

@ -0,0 +1,53 @@
# I'm a huge section heading
some text
## I'm a large section heading
some text
### I'm a medium section heading
some text
#### I'm a small section heading
some text
#### I'm a small section heading
some text
##### I do not differ in size anymore
some text
# Iterations/Enumerations
* an item
* another item
1. first item
2. second item
# Paragraphs and line breaks
If you want to start a new paragraph...
... leave an empty line.
To force a line break (empty spaces are inserted after this)
add two empty spaces in the end of the line.
# Coding
Let's use some `inline code`.
Or do it blockwise:
```
def(arg):
return arg+1
```
# Tables
| id | col1 | col2 |
|-----|------|------|
| 1 | hs | 123 |
| 2 | ts | 321 |
# Images
![](img.png)
# Links
[I'm a web link to the git homepage](https://git-scm.com/)
[I'm a link to another file](anotherfile.md)
Loading…
Cancel
Save