Browse Source

adapted amplicon workflow for directed and mixed orientation libraries

master
parent
commit
25064ece01
  1. 2
      .gitignore
  2. 2
      Amplicon_dada2_MiSeq/Snakefile_classification
  3. 15
      Amplicon_dada2_MiSeq/Snakefile_denoising
  4. 71
      Amplicon_dada2_MiSeq/Snakefile_trimming
  5. 71
      Amplicon_dada2_MiSeq/config/config.yaml
  6. 71
      Amplicon_dada2_MiSeq/scripts/checksums.R
  7. 2
      Amplicon_dada2_MiSeq/scripts/classification.R
  8. 377
      Amplicon_dada2_MiSeq/scripts/denoising.R
  9. 120
      Amplicon_dada2_MiSeq/scripts/trimming.R

2
.gitignore vendored

@ -2,6 +2,8 @@ Amplicon_dada2_MiSeq/.snakemake
Amplicon_dada2_MiSeq/.snakemake/*
Amplicon_dada2_MiSeq/tests/
Amplicon_dada2_MiSeq/tests/*.txt
Amplicon_dada2_MiSeq/ref/
Amplicon_dada2_MiSeq/ref/*
metaG_Illumina_PE/.snakemake
metaG_Illumina_PE/.snakemake/*
metaG_Illumina_PE/slurm*

2
Amplicon_dada2_MiSeq/Snakefile_classification

@ -54,7 +54,7 @@ rule dada2_classification:
"""
if [[ "{params.plastid}" != "" ]]
then
{params.script} -n {params.project} -a {input.asvtab} -f {input.fasta} -b {params.min_boot} -t {params.train_set} -l "{params.lineage}" -p {params.plastid} -d {params.adir} -v {params.analysis} -c {threads} &>> {log}
{params.script} -n {params.project} -a {input.asvtab} -f {input.fasta} -b {params.min_boot} -t {params.train_set} -l "{params.lineage}" -o {params.plastid} -d {params.adir} -v {params.analysis} -c {threads} &>> {log}
else
{params.script} -n {params.project} -a {input.asvtab} -f {input.fasta} -b {params.min_boot} -t {params.train_set} -l "{params.lineage}" -d {params.adir} -v {params.analysis} -c {threads} &>> {log}
fi

15
Amplicon_dada2_MiSeq/Snakefile_denoising

@ -33,6 +33,7 @@ rule dada2_denoising:
errfun = config["errfun"],
rescue = config["rescue"],
prefix = config["asv_prefix"],
orientation = config["orientation"],
project = config["project"],
sample_list = config["sample_list"],
adir = config["adir"],
@ -47,8 +48,18 @@ rule dada2_denoising:
"""
if [[ "{params.rescue}" != "" ]]
then
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -r {params.rescue} -p {params.prefix} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
if [[ "{params.orientation}" == "mixed" ]]
then
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -r {params.rescue} -x {params.prefix} -o {params.orientation} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
else
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -r {params.rescue} -x {params.prefix} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
fi
else
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -p {params.prefix} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
if [[ "{params.orientation}" == "mixed" ]]
then
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -x {params.prefix} -o {params.orientation} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
else
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1} -L {params.truncLen_R2} -f {params.min_len} -F {params.max_len} -e {params.maxEE_R1} -E {params.maxEE_R2} -m {params.errfun} -x {params.prefix} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
fi
fi
"""

71
Amplicon_dada2_MiSeq/Snakefile_trimming

@ -31,15 +31,13 @@ def R1_fun(wildcards):
def R2_fun(wildcards):
return PATH_R2[wildcards.sample]
rule primer_clipping:
rule primer_clipping_fr:
input:
fq_R1 = R1_fun,
fq_R2 = R2_fun
output:
fq_fr_R1 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R1.fastq.gz",
fq_fr_R2 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R2.fastq.gz",
fq_rf_R1 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R1.fastq.gz",
fq_rf_R2 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R2.fastq.gz"
fq_fr_R2 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R2.fastq.gz"
params:
fwd = config["fwd"],
rev = config["rev"],
@ -49,14 +47,35 @@ rule primer_clipping:
conda:
config["wdir"] + "/envs/cutadapt.yaml"
log:
fr = config["adir"] + "/Intermediate_results/Logfiles/" + config["analysis"] + "/{sample}.clip_fr.log",
rf = config["adir"] + "/Intermediate_results/Logfiles/" + config["analysis"] + "/{sample}.clip_rf.log"
fr = config["adir"] + "/Intermediate_results/Logfiles/" + config["analysis"] + "/{sample}.clip_fr.log"
shell:
"""
cutadapt -j 1 --no-indels -e {params.error} -g "{params.fwd};o={params.ofwd}" -G "{params.rev};o={params.orev}" -m 50 --discard-untrimmed -o {output.fq_fr_R1} -p {output.fq_fr_R2} {input.fq_R1} {input.fq_R2} > {log.fr} 2>&1
cutadapt -j 1 --no-indels -e {params.error} -g "{params.rev};o={params.orev}" -G "{params.fwd};o={params.ofwd}" -m 50 --discard-untrimmed -o {output.fq_rf_R1} -p {output.fq_rf_R2} {input.fq_R1} {input.fq_R2} > {log.rf} 2>&1
"""
if config["orientation"] != "mixed":
rule primer_clipping_rf:
input:
fq_R1 = R1_fun,
fq_R2 = R2_fun
output:
fq_rf_R1 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R1.fastq.gz",
fq_rf_R2 = config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R2.fastq.gz"
params:
fwd = config["fwd"],
rev = config["rev"],
ofwd = config["ofwd"],
orev = config["orev"],
error = config["error"]
conda:
config["wdir"] + "/envs/cutadapt.yaml"
log:
rf = config["adir"] + "/Intermediate_results/Logfiles/" + config["analysis"] + "/{sample}.clip_rf.log"
shell:
"""
cutadapt -j 1 --no-indels -e {params.error} -g "{params.rev};o={params.orev}" -G "{params.fwd};o={params.ofwd}" -m 50 --discard-untrimmed -o {output.fq_rf_R1} -p {output.fq_rf_R2} {input.fq_R1} {input.fq_R2} > {log.rf} 2>&1
"""
rule count_seqs:
input:
clipped = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R1.fastq.gz", sample = SAMPLES)
@ -64,6 +83,7 @@ rule count_seqs:
nseq = config["adir"] + "/Intermediate_results/" + config["analysis"] + "/nseqs_" + config["project"] + ".txt"
params:
sample_list = config["sample_list"],
orientation = config["orientation"],
adir = config["adir"],
analysis = config["analysis"]
conda:
@ -72,11 +92,18 @@ rule count_seqs:
config["sample_threads"]
shell:
"""
cut -f2 {params.sample_list} | parallel -k -j {threads} "bzcat {{}} | awk 'END {{print NR/4}}'" | paste <(cut -f1 {params.sample_list}) - > {params.adir}/Intermediate_results/{params.analysis}/tmp1
cut -f1 {params.sample_list} | parallel -k -j {threads} "zcat {params.adir}/Validated_data/{params.analysis}/{{}}_clip_fr_R1.fastq.gz | awk 'END {{print NR/4}}'" | paste {params.adir}/Intermediate_results/{params.analysis}/tmp1 - > {params.adir}/Intermediate_results/{params.analysis}/tmp2
cut -f1 {params.sample_list} | parallel -k -j {threads} "zcat {params.adir}/Validated_data/{params.analysis}/{{}}_clip_rf_R1.fastq.gz | awk 'END {{print NR/4}}'" | paste {params.adir}/Intermediate_results/{params.analysis}/tmp2 - > {params.adir}/Intermediate_results/{params.analysis}/tmp3
echo -e 'SID\\tDemux\\tClipped_fr\\tClipped_rf' | cat - "{params.adir}/Intermediate_results/{params.analysis}/tmp3" > {output.nseq}
rm {params.adir}/Intermediate_results/{params.analysis}/tmp*
cut -f2 {params.sample_list} | parallel -k -j {threads} "bzcat {{}} | awk 'END {{print NR/4}}'" | paste <(cut -f1 {params.sample_list}) - > {params.adir}/Intermediate_results/{params.analysis}/tmp
cut -f1 {params.sample_list} | parallel -k -j {threads} "zcat {params.adir}/Validated_data/{params.analysis}/{{}}_clip_fr_R1.fastq.gz | awk 'END {{print NR/4}}'" | paste {params.adir}/Intermediate_results/{params.analysis}/tmp - > {params.adir}/Intermediate_results/{params.analysis}/tmp2
mv {params.adir}/Intermediate_results/{params.analysis}/tmp2 {params.adir}/Intermediate_results/{params.analysis}/tmp
if [[ "{params.orientation}" == "mixed" ]]
then
cut -f1 {params.sample_list} | parallel -k -j {threads} "zcat {params.adir}/Validated_data/{params.analysis}/{{}}_clip_rf_R1.fastq.gz | awk 'END {{print NR/4}}'" | paste {params.adir}/Intermediate_results/{params.analysis}/tmp - > {params.adir}/Intermediate_results/{params.analysis}/tmp2
mv {params.adir}/Intermediate_results/{params.analysis}/tmp2 {params.adir}/Intermediate_results/{params.analysis}/tmp
echo -e 'SID\\tDemux\\tClipped_fr\\tClipped_rf' | cat - "{params.adir}/Intermediate_results/{params.analysis}/tmp" > {output.nseq}
else
echo -e 'SID\\tDemux\\tClipped_fr' | cat - "{params.adir}/Intermediate_results/{params.analysis}/tmp" > {output.nseq}
fi
rm {params.adir}/Intermediate_results/{params.analysis}/tmp
"""
rule screen_filter_settings:
@ -93,6 +120,7 @@ rule screen_filter_settings:
fragment_max = config["fragment_max"],
maxEE_min = config["maxEE_min"],
maxEE_max = config["maxEE_max"],
orientation = config["orientation"],
project = config["project"],
sample_list = config["sample_list"],
adir = config["adir"],
@ -105,21 +133,27 @@ rule screen_filter_settings:
config["adir"] + "/Intermediate_results/Logfiles/" + config["analysis"] + "/trimming_R.log"
shell:
"""
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1_min} -L {params.truncLen_R1_max} -f {params.fragment_max} -e {params.maxEE_min} -E {params.maxEE_max} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
if [[ "{params.orientation}" == "mixed" ]]
then
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1_min} -L {params.truncLen_R1_max} -f {params.fragment_max} -e {params.maxEE_min} -E {params.maxEE_max} -o {params.orientation} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
else
{params.script} -n {params.project} -s {params.sample_list} -l {params.truncLen_R1_min} -L {params.truncLen_R1_max} -f {params.fragment_max} -e {params.maxEE_min} -E {params.maxEE_max} -d {params.adir} -a {params.analysis} -c {threads} &>> {log}
fi
"""
rule prepare_checksums:
input:
clipped_fr_R1 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R1.fastq.gz", sample = SAMPLES),
clipped_fr_R2 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_fr_R2.fastq.gz", sample = SAMPLES),
clipped_rf_R1 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R1.fastq.gz", sample = SAMPLES),
clipped_rf_R2 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R2.fastq.gz", sample = SAMPLES)
clipped_rf_R1 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R1.fastq.gz", sample = SAMPLES) if config["orientation"] else [],
clipped_rf_R2 = expand(config["adir"] + "/Validated_data/" + config["analysis"] + "/{sample}_clip_rf_R2.fastq.gz", sample = SAMPLES) if config["orientation"] else []
output:
checksums = config["adir"] + "/Intermediate_results/" + config["analysis"] + "/md5_checksums_" + config["project"] + ".txt"
params:
script = config["wdir"] + "/scripts/checksums.R",
project = config["project"],
sample_list = config["sample_list"],
orientation = config["orientation"],
adir = config["adir"],
analysis = config["analysis"]
conda:
@ -128,8 +162,11 @@ rule prepare_checksums:
"""
md5sum {input.clipped_fr_R1} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_fr_R1"
md5sum {input.clipped_fr_R2} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_fr_R2"
md5sum {input.clipped_rf_R1} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R1"
md5sum {input.clipped_rf_R2} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R2"
if [[ "{params.orientation}" == "mixed" ]]
then
md5sum {input.clipped_rf_R1} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R1"
md5sum {input.clipped_rf_R2} > "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R2"
fi
{params.script} -f "{params.adir}/Intermediate_results/{params.analysis}/tmp_fr_R1" -F "{params.adir}/Intermediate_results/{params.analysis}/tmp_fr_R2" -r "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R1" -R "{params.adir}/Intermediate_results/{params.analysis}/tmp_rf_R2" -s {params.sample_list} -o {output.checksums}
rm {params.adir}/Intermediate_results/{params.analysis}/tmp_*
"""

71
Amplicon_dada2_MiSeq/config/config.yaml

@ -3,43 +3,46 @@ project: "Test_dada2"
# The analysis ID is a way to further structure the data in Intermediate_results and Validated_data
# e.g. in cases where multiple analyses (16S, metaG) or multiple versions of an analysis are run within the same project
# analysis: "run1_16S"
analysis: "run1_18S"
analysis: "run2_16S"
# analysis: "run2_18S"
# Location of repository with analysis workflow
wdir: "/bio/Common_repositories/workflow_templates/Amplicon_dada2_MiSeq"
# Name of file with sample names and sequence file locations
# sample_list: "/bio/Common_repositories/workflow_templates/Amplicon_dada2_MiSeq/assets/sample_list.txt"
# sample_list: "/home/hassenru/Repos/IOWseq000006_warnow_salt/dada2_16S/sample_list_IOWseq000006.txt"
sample_list: "/home/hassenru/Repos/IOWseq000006_warnow_salt/dada2_18S/sample_list_IOWseq000006.txt"
sample_list: "/home/hassenru/Repos/IOWseq000006_warnow_salt/dada2_16S/sample_list_IOWseq000006.txt"
# sample_list: "/home/hassenru/Repos/IOWseq000006_warnow_salt/dada2_18S/sample_list_IOWseq000006.txt"
# Directory for analysis data
adir: "/bio/Analysis_data/lost_found/amplicon_wf_validation"
# sequence name prefix (e.g. asv_16s)
# asv_prefix: "asv_16s"
asv_prefix: "asv_18s"
asv_prefix: "asv_16s"
# asv_prefix: "asv_18s"
# library orientation. Specify mixed if R1 and R2 contain read from either orientation. Otherwise, set to empty string
# orientation: ""
orientation: "mixed"
# Default number of threads
threads: 1
# Number of threads for parallel processing of samples within dada2
sample_threads: 5
sample_threads: 60
# Number of threads for other parallel processing
parallel_threads: 60
# Primer sequences
# fwd: "^CCTAYGGGRBGCASCAG"
# rev: "^GGACTACNNGGGTATCTAAT"
# ofwd: 16
# orev: 19
fwd: "^CCAGCASCYGCGGTAATTCC"
rev: "^ACTTTCGTTCTTGATYRATGA"
ofwd: 19
orev: 20
fwd: "^CCTAYGGGRBGCASCAG"
rev: "^GGACTACNNGGGTATCTAAT"
ofwd: 16
orev: 19
# fwd: "^CCAGCASCYGCGGTAATTCC"
# rev: "^ACTTTCGTTCTTGATYRATGA"
# ofwd: 19
# orev: 20
# Allowed mismatches to primer
error: 0.16
@ -47,11 +50,11 @@ error: 0.16
# Filtering parameters: range for screening
# The truncLen for the R2 read will be calculated based on fragment_max - truncLen_R1 + 30
# maxEE will be incrementally increased with the R2 read lead, i.e. c(2, 2), c(2, 3), c(3, 3)
truncLen_R1_min: 250
truncLen_R1_min: 260
truncLen_R1_max: 270
fragment_max: 430
maxEE_min: 2
maxEE_max: 4
maxEE_max: 3
# Filtering parameters: final
truncLen_R1: 260
@ -60,34 +63,34 @@ maxEE_R1: 3
maxEE_R2: 3
# Expected fragment length range (all sequences outside this range will be discarded)
min_len: 304
max_len: 426
min_len: 380
max_len: 433
# Should the modified loess error function be used
# errfun: "default"
errfun: "mod"
errfun: "default"
# errfun: "mod"
# Should fragments that exceed maximum insert size for merging be 'rescued' by concatenation?
# Provide path to mapping reference for insert size calculation
# rescue: "/bio/Databases/SILVA/v138.1/SILVA_138.1_SSURef_NR99_tax_silva_trunc.fasta"
rescue: ""
rescue: "/bio/Databases/SILVA/v138.1/SILVA_138.1_SSURef_NR99_tax_silva_trunc.fasta"
# rescue: ""
# Location of silva training data set for dada2-based taxonomic classification
# ref_rdp: "/bio/Databases/SILVA/v138.1/dada2/silva_nr99_v138.1_train_set.fa.gz"
ref_rdp: "/bio/Databases/PR2/v4.14.0/dada2/pr2_version_4.14.0_SSU_dada2.fasta.gz"
ref_rdp: "/bio/Databases/SILVA/v138.1/dada2/silva_nr99_v138.1_train_set.fa.gz"
# ref_rdp: "/bio/Databases/PR2/v4.14.0/dada2/pr2_version_4.14.0_SSU_dada2.fasta.gz"
# Switch on organelle classification
# organelle_rdp: "/bio/Databases/PR2/v4.14.0/dada2/pr2_version_4.14.0_SSU_dada2.fasta.gz"
organelle_rdp: ""
organelle_rdp: "/bio/Databases/PR2/v4.14.0/dada2/pr2_version_4.14.0_SSU_dada2.fasta.gz"
# organelle_rdp: ""
# Bootstrap cut-off for taxonomic classification
min_boot: 70
# Location of specific reference DB for blast
# ref_blast: "/bio/Databases/SILVA/v138.1/blast/silva_138.1_nr99_blastdb"
ref_blast: "/bio/Databases/PR2/v4.14.0/blast/pr2_version_4.14.0_SSU_blastdb"
# organelle_blast: "/bio/Databases/PR2/v4.14.0/blast/pr2_version_4.14.0_SSU_blastdb"
organelle_blast: ""
ref_blast: "/bio/Databases/SILVA/v138.1/blast/silva_138.1_nr99_blastdb"
# ref_blast: "/bio/Databases/PR2/v4.14.0/blast/pr2_version_4.14.0_SSU_blastdb"
organelle_blast: "/bio/Databases/PR2/v4.14.0/blast/pr2_version_4.14.0_SSU_blastdb"
# organelle_blast: ""
# Location of NCBI NT reference database for megablast
ncbi: "/bio/Databases/NCBI_nt/20220409/blast/nt"
@ -104,6 +107,6 @@ silva_acc2tax: "/bio/Databases/SILVA/v138.1/silva_138.1_nr99_acc2tax.txt"
silva_taxrank: "/bio/Databases/SILVA/v138.1/tax_slv_ssu_138.1.txt"
# Domains to keep in the output
# lineage: "Bacteria,Archaea"
lineage: "Eukaryota"
lineage: "Bacteria,Archaea"
# lineage: "Eukaryota"

71
Amplicon_dada2_MiSeq/scripts/checksums.R

@ -119,42 +119,51 @@ if (is.null(opt$fr_R1) | is.null(opt$fr_R2) | is.null(opt$rf_R1) |
sample.names <- read.table(opt$sample_list, h = F, sep = "\t", stringsAsFactors = F)[, 1]
fr_R1 <- read.table(opt$fr_R1, h = F, stringsAsFactors = F)
fr_R2 <- read.table(opt$fr_R2, h = F, stringsAsFactors = F)
rf_R1 <- read.table(opt$rf_R1, h = F, stringsAsFactors = F)
rf_R2 <- read.table(opt$rf_R2, h = F, stringsAsFactors = F)
rownames(fr_R1) <- gsub("_clip_fr_R1.fastq.gz", "", basename(fr_R1$V2), fixed = T)
rownames(fr_R2) <- gsub("_clip_fr_R2.fastq.gz", "", basename(fr_R2$V2), fixed = T)
rownames(rf_R1) <- gsub("_clip_rf_R1.fastq.gz", "", basename(rf_R1$V2), fixed = T)
rownames(rf_R2) <- gsub("_clip_rf_R2.fastq.gz", "", basename(rf_R2$V2), fixed = T)
fr_R1 <- fr_R1[sample.names, ]
fr_R2 <- fr_R2[sample.names, ]
rf_R1 <- rf_R1[sample.names, ]
rf_R2 <- rf_R2[sample.names, ]
checksums <- data.frame(
seq_id = sample.names,
forward_read_file_name = paste0(
basename(fr_R1$V2),
";",
basename(rf_R1$V2)
),
forward_read_file_checksum = paste0(
basename(fr_R1$V1),
";",
basename(rf_R1$V1)
),
reverse_read_file_name = paste0(
basename(fr_R2$V2),
";",
basename(rf_R2$V2)
),
reverse_read_file_checksum = paste0(
basename(fr_R2$V1),
";",
basename(rf_R2$V1)
if(file.exists(opt$rf_R1)) {
rf_R1 <- read.table(opt$rf_R1, h = F, stringsAsFactors = F)
rf_R2 <- read.table(opt$rf_R2, h = F, stringsAsFactors = F)
rownames(rf_R1) <- gsub("_clip_rf_R1.fastq.gz", "", basename(rf_R1$V2), fixed = T)
rownames(rf_R2) <- gsub("_clip_rf_R2.fastq.gz", "", basename(rf_R2$V2), fixed = T)
rf_R1 <- rf_R1[sample.names, ]
rf_R2 <- rf_R2[sample.names, ]
checksums <- data.frame(
seq_id = sample.names,
forward_read_file_name = paste0(
basename(fr_R1$V2),
";",
basename(rf_R1$V2)
),
forward_read_file_checksum = paste0(
basename(fr_R1$V1),
";",
basename(rf_R1$V1)
),
reverse_read_file_name = paste0(
basename(fr_R2$V2),
";",
basename(rf_R2$V2)
),
reverse_read_file_checksum = paste0(
basename(fr_R2$V1),
";",
basename(rf_R2$V1)
)
)
)
} else {
checksums <- data.frame(
seq_id = sample.names,
forward_read_file_name = basename(fr_R1$V2),
forward_read_file_checksum = basename(fr_R1$V1),
reverse_read_file_name = basename(fr_R2$V2),
reverse_read_file_checksum = basename(fr_R2$V1)
)
}
write.table(
checksums,

2
Amplicon_dada2_MiSeq/scripts/classification.R

@ -106,7 +106,7 @@ option_list <- list(
metavar = "character"
),
make_option(
c("-p", "--plastid"),
c("-o", "--plastid"),
type = "character",
default = NULL,
help = "location of reference training data set for 16S chlorplast and mitochondria classification",

377
Amplicon_dada2_MiSeq/scripts/denoising.R

@ -153,12 +153,19 @@ option_list <- list(
metavar = "character"
),
make_option(
c("-p", "--prefix"),
c("-x", "--prefix"),
type = "character",
default = NULL,
help = "prefix to use for sequence header",
metavar = "character"
),
make_option(
c("-o", "--orientation"),
type = "character",
default = "fr",
help = "specify if libraries were generated in mixed orientation",
metavar = "character"
),
make_option(
c("-c", "--cpus"),
type = "integer",
@ -310,22 +317,28 @@ sample.names <- read.table(opt$samples, h = F, sep = "\t", stringsAsFactors = F)
path <- paste0(opt$dir, "/Validated_data/", opt$analysis)
fnFs.fr <- paste0(path, "/", sample.names, "_clip_fr_R1.fastq.gz")
fnRs.fr <- paste0(path, "/", sample.names, "_clip_fr_R2.fastq.gz")
fnFs.rf <- paste0(path, "/", sample.names, "_clip_rf_R1.fastq.gz")
fnRs.rf <- paste0(path, "/", sample.names, "_clip_rf_R2.fastq.gz")
names(fnFs.fr) <- sample.names
names(fnRs.fr) <- sample.names
names(fnFs.rf) <- sample.names
names(fnRs.rf) <- sample.names
if(opt$orientation == "mixed") {
fnFs.rf <- paste0(path, "/", sample.names, "_clip_rf_R1.fastq.gz")
fnRs.rf <- paste0(path, "/", sample.names, "_clip_rf_R2.fastq.gz")
names(fnFs.rf) <- sample.names
names(fnRs.rf) <- sample.names
}
# Place filtered files in Filtered/ subdirectory
filtFs.fr <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_fr_R1.fastq"))
filtRs.fr <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_fr_R2.fastq"))
filtFs.rf <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_rf_R1.fastq"))
filtRs.rf <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_rf_R2.fastq"))
names(filtFs.fr) <- sample.names
names(filtRs.fr) <- sample.names
names(filtFs.rf) <- sample.names
names(filtRs.rf) <- sample.names
if(opt$orientation == "mixed") {
filtFs.rf <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_rf_R1.fastq"))
filtRs.rf <- file.path(opt$dir, "Intermediate_results", opt$analysis, "Filtered", paste0(sample.names, "_filt_rf_R2.fastq"))
names(filtFs.rf) <- sample.names
names(filtRs.rf) <- sample.names
}
### Filter and trim reads with optimized parameters ####
@ -345,28 +358,39 @@ filt.out.fr <- filterAndTrim(
compress = F,
multithread = opt$cpus
)
filt.out.rf <- filterAndTrim(
fwd = fnFs.rf,
filt = filtFs.rf,
rev = fnRs.rf,
filt.rev = filtRs.rf,
truncLen = c(opt$truncLen_R1, opt$truncLen_R2),
maxN = 0,
minQ = 2,
maxEE = c(opt$error_R1, opt$error_R2),
truncQ = 0,
rm.phix = TRUE,
compress = F,
multithread = opt$cpus
)
if(opt$orientation == "mixed") {
filt.out.rf <- filterAndTrim(
fwd = fnFs.rf,
filt = filtFs.rf,
rev = fnRs.rf,
filt.rev = filtRs.rf,
truncLen = c(opt$truncLen_R1, opt$truncLen_R2),
maxN = 0,
minQ = 2,
maxEE = c(opt$error_R1, opt$error_R2),
truncQ = 0,
rm.phix = TRUE,
compress = F,
multithread = opt$cpus
)
}
# Repeat quality check after trimming
quality_check(
c(filtFs.fr, filtFs.rf),
c(filtRs.fr, filtRs.rf),
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfileFiltered_", opt$name),
cpus = opt$cpus
)
if(opt$orientation == "mixed") {
quality_check(
c(filtFs.fr, filtFs.rf),
c(filtRs.fr, filtRs.rf),
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfileFiltered_", opt$name),
cpus = opt$cpus
)
} else {
quality_check(
filtFs.fr,
filtRs.fr,
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfileFiltered_", opt$name),
cpus = opt$cpus
)
}
### Denoising ####
@ -375,13 +399,17 @@ quality_check(
if(opt$loess == "mod") {
errF.fr <- learnErrors(filtFs.fr, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.fr <- learnErrors(filtRs.fr, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errF.rf <- learnErrors(filtFs.rf, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.rf <- learnErrors(filtRs.rf, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
if(opt$orientation == "mixed") {
errF.rf <- learnErrors(filtFs.rf, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.rf <- learnErrors(filtRs.rf, errorEstimationFunction = loessErrfun2, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
}
} else {
errF.fr <- learnErrors(filtFs.fr, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.fr <- learnErrors(filtRs.fr, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errF.rf <- learnErrors(filtFs.rf, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.rf <- learnErrors(filtRs.rf, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
if(opt$orientation == "mixed") {
errF.rf <- learnErrors(filtFs.rf, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
errR.rf <- learnErrors(filtRs.rf, multithread = opt$cpus, randomize = TRUE, verbose = 1, MAX_CONSIST = 10)
}
}
save.image(paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/workspace_denoising_", opt$name, ".Rdata"))
@ -389,19 +417,25 @@ save.image(paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/workspace_d
pdf(paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/ErrorProfiles_", opt$name, ".pdf"))
barplot(log10(dada2:::checkConvergence(errF.fr) + 1), main = "Convergence_fwd")
barplot(log10(dada2:::checkConvergence(errR.fr) + 1), main = "Convergence_rev")
barplot(log10(dada2:::checkConvergence(errF.rf) + 1), main = "Convergence_fwd")
barplot(log10(dada2:::checkConvergence(errR.rf) + 1), main = "Convergence_rev")
if(opt$orientation == "mixed") {
barplot(log10(dada2:::checkConvergence(errF.rf) + 1), main = "Convergence_fwd")
barplot(log10(dada2:::checkConvergence(errR.rf) + 1), main = "Convergence_rev")
}
plotErrors(errF.fr, nominalQ = TRUE)
plotErrors(errR.fr, nominalQ = TRUE)
plotErrors(errF.rf, nominalQ = TRUE)
plotErrors(errR.rf, nominalQ = TRUE)
if(opt$orientation == "mixed") {
plotErrors(errF.rf, nominalQ = TRUE)
plotErrors(errR.rf, nominalQ = TRUE)
}
dev.off()
# Dereplicate and denoise samples
dadaFs.fr <- dada(filtFs.fr, err = errF.fr, multithread = opt$cpus, pool = TRUE)
dadaRs.fr <- dada(filtRs.fr, err = errR.fr, multithread = opt$cpus, pool = TRUE)
dadaFs.rf <- dada(filtFs.rf, err = errF.rf, multithread = opt$cpus, pool = TRUE)
dadaRs.rf <- dada(filtRs.rf, err = errR.rf, multithread = opt$cpus, pool = TRUE)
if(opt$orientation == "mixed") {
dadaFs.rf <- dada(filtFs.rf, err = errF.rf, multithread = opt$cpus, pool = TRUE)
dadaRs.rf <- dada(filtRs.rf, err = errR.rf, multithread = opt$cpus, pool = TRUE)
}
save.image(paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/workspace_denoising_", opt$name, ".Rdata"))
@ -417,15 +451,17 @@ mergers.fr0 <- mergePairs(
verbose = TRUE,
returnRejects = TRUE
)
mergers.rf0 <- mergePairs(
dadaFs.rf,
filtFs.rf,
dadaRs.rf,
filtRs.rf,
minOverlap = 10,
verbose = TRUE,
returnRejects = TRUE
)
if(opt$orientation == "mixed") {
mergers.rf0 <- mergePairs(
dadaFs.rf,
filtFs.rf,
dadaRs.rf,
filtRs.rf,
minOverlap = 10,
verbose = TRUE,
returnRejects = TRUE
)
}
# rescue unmerged
if(!is.null(opt$rescue)) {
@ -435,10 +471,12 @@ if(!is.null(opt$rescue)) {
writeFasta(unmerged.fr[[1]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Fs.fr.fasta"))
writeFasta(unmerged.fr[[2]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Rs.fr.fasta"))
unmerged.rf <- extract_unmerged(dadaFs.rf, dadaRs.rf, mergers.rf0)
writeFasta(unmerged.rf[[1]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Fs.rf.fasta"))
writeFasta(unmerged.rf[[2]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Rs.rf.fasta"))
if(opt$orientation == "mixed") {
unmerged.rf <- extract_unmerged(dadaFs.rf, dadaRs.rf, mergers.rf0)
writeFasta(unmerged.rf[[1]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Fs.rf.fasta"))
writeFasta(unmerged.rf[[2]], file = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Rs.rf.fasta"))
}
# mapping
system(
paste0(
@ -454,21 +492,23 @@ if(!is.null(opt$rescue)) {
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_unmerged.fr.bam"
)
)
system(
paste0(
"bbmap.sh threads=",
opt$cpus,
" ref=",
opt$rescue,
" in=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Fs.rf.fasta",
" in2=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Rs.rf.fasta",
" out=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_unmerged.rf.bam"
if(opt$orientation == "mixed") {
system(
paste0(
"bbmap.sh threads=",
opt$cpus,
" ref=",
opt$rescue,
" in=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Fs.rf.fasta",
" in2=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_Rs.rf.fasta",
" out=",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_unmerged.rf.bam"
)
)
)
}
# extract insert size
system(
paste0(
@ -478,15 +518,17 @@ if(!is.null(opt$rescue)) {
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.fr.txt"
)
)
system(
paste0(
"samtools view -F2304 -f66 -m50 ",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_unmerged.rf.bam",
" | cut -f1,9 > ",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.rf.txt"
if(opt$orientation == "mixed") {
system(
paste0(
"samtools view -F2304 -f66 -m50 ",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_unmerged.rf.bam",
" | cut -f1,9 > ",
opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.rf.txt"
)
)
)
}
# read insert sizes
is.fr <- read.table(
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.fr.txt"),
@ -494,19 +536,23 @@ if(!is.null(opt$rescue)) {
sep = "\t",
col.names = c("seqID", "insert")
)
is.rf <- read.table(
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.rf.txt"),
h = F,
sep = "\t",
col.names = c("seqID", "insert")
)
if(opt$orientation == "mixed") {
is.rf <- read.table(
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/unmerged_", opt$name, "_is.rf.txt"),
h = F,
sep = "\t",
col.names = c("seqID", "insert")
)
}
# filter to insert sizes that exceed maximum length of merged sequences
is_long.fr <- is.fr[is.fr$insert > (opt$truncLen_R1 + opt$truncLen_R2 - 10), ] %>%
separate(seqID, into = c("sample_index", "row_index"), sep = "_", remove = F, convert = T)
is_long.rf <- is.rf[-is.rf$insert > (opt$truncLen_R1 + opt$truncLen_R2 - 10), ] %>%
separate(seqID, into = c("sample_index", "row_index"), sep = "_", remove = F, convert = T)
if(opt$orientation == "mixed") {
is_long.rf <- is.rf[-is.rf$insert > (opt$truncLen_R1 + opt$truncLen_R2 - 10), ] %>%
separate(seqID, into = c("sample_index", "row_index"), sep = "_", remove = F, convert = T)
}
# retrieve and concatenate sequence
mergers.fr <- mergers.fr0
for(i in 1:length(mergers.fr)) {
@ -529,52 +575,60 @@ if(!is.null(opt$rescue)) {
mergers.fr[[i]] <- mergers.fr[[i]][mergers.fr[[i]]$accept, ]
}
}
mergers.rf <- mergers.rf0
for(i in 1:length(mergers.rf)) {
if(i %in% unique(is_long.rf$sample_index)) {
tmp_index <- is_long.rf$row_index[is_long.rf$sample_index == i]
if(length(tmp_index) > 0) {
mergers.rf[[i]]$sequence[tmp_index] <- paste0(
unmerged.rf[[1]][paste0(is_long.rf$seqID[is_long.rf$sample_index == i], "/1")],
"NNNNNNNNNN",
rc(unmerged.rf[[2]][paste0(is_long.rf$seqID[is_long.rf$sample_index == i], "/2")])
)
mergers.rf[[i]]$nmatch[tmp_index] <- 0
mergers.rf[[i]]$nmismatch[tmp_index] <- 0
mergers.rf[[i]]$nindel[tmp_index] <- 0
mergers.rf[[i]]$prefer[tmp_index] <- NA
mergers.rf[[i]]$accept[tmp_index] <- TRUE
if(opt$orientation == "mixed") {
mergers.rf <- mergers.rf0
for(i in 1:length(mergers.rf)) {
if(i %in% unique(is_long.rf$sample_index)) {
tmp_index <- is_long.rf$row_index[is_long.rf$sample_index == i]
if(length(tmp_index) > 0) {
mergers.rf[[i]]$sequence[tmp_index] <- paste0(
unmerged.rf[[1]][paste0(is_long.rf$seqID[is_long.rf$sample_index == i], "/1")],
"NNNNNNNNNN",
rc(unmerged.rf[[2]][paste0(is_long.rf$seqID[is_long.rf$sample_index == i], "/2")])
)
mergers.rf[[i]]$nmatch[tmp_index] <- 0
mergers.rf[[i]]$nmismatch[tmp_index] <- 0
mergers.rf[[i]]$nindel[tmp_index] <- 0
mergers.rf[[i]]$prefer[tmp_index] <- NA
mergers.rf[[i]]$accept[tmp_index] <- TRUE
mergers.rf[[i]] <- mergers.rf[[i]][mergers.rf[[i]]$accept, ]
}
} else {
mergers.rf[[i]] <- mergers.rf[[i]][mergers.rf[[i]]$accept, ]
}
} else {
mergers.rf[[i]] <- mergers.rf[[i]][mergers.rf[[i]]$accept, ]
}
}
} else {
mergers.fr <- lapply(mergers.fr0, function(x) x[x$accept, ])
mergers.rf <- lapply(mergers.rf0, function(x) x[x$accept, ])
if(opt$orientation == "mixed") {
mergers.rf <- lapply(mergers.rf0, function(x) x[x$accept, ])
}
}
# Create sequence table
seqtab.fr <- makeSequenceTable(mergers.fr)
seqtab.rf <- makeSequenceTable(mergers.rf)
msg(paste0("There are ", ncol(seqtab.fr), " ASVs in fr orientation, and ", ncol(seqtab.rf), " ASVs in rf orientation after merging.\n"))
# This is the step at which separate denoising runs should be combined
# (e.g. if data comes from different sequencer runs or lanes,
# or if fwd-rev and rev-fwd orientation were processed separately)
# Generate reverse complement of rf
seqtab.rf.rc <- seqtab.rf
colnames(seqtab.rf.rc) <- rc(colnames(seqtab.rf))
# Merge sequence tables
seqtab <- mergeSequenceTables(
seqtab.fr,
seqtab.rf.rc,
repeats = "sum"
)
msg(paste0("The combined fr and rf tables contain ", ncol(seqtab), " ASVs.\n"))
if(opt$orientation == "mixed") {
seqtab.rf <- makeSequenceTable(mergers.rf)
msg(paste0("There are ", ncol(seqtab.fr), " ASVs in fr orientation, and ", ncol(seqtab.rf), " ASVs in rf orientation after merging.\n"))
# This is the step at which separate denoising runs should be combined
# (e.g. if data comes from different sequencer runs or lanes,
# or if fwd-rev and rev-fwd orientation were processed separately)
# Generate reverse complement of rf
seqtab.rf.rc <- seqtab.rf
colnames(seqtab.rf.rc) <- rc(colnames(seqtab.rf))
# Merge sequence tables
seqtab <- mergeSequenceTables(
seqtab.fr,
seqtab.rf.rc,
repeats = "sum"
)
} else {
seqtab <- seqtab.fr
}
msg(paste0("The merged sequence table contains ", ncol(seqtab), " ASVs.\n"))
### Chimera removal and further cleaning steps ####
@ -617,36 +671,59 @@ nSeq <- read.table(paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/nse
nSeq <- nSeq[sample.names, ]
getN <- function(x) sum(getUniques(x))
track <- cbind(
nSeq$Demux,
filt.out.fr[, 1],
filt.out.rf[, 1],
filt.out.fr[, 2],
filt.out.rf[, 2],
sapply(dadaFs.fr, getN),
sapply(dadaRs.fr, getN),
sapply(dadaFs.rf, getN),
sapply(dadaRs.rf, getN),
sapply(mergers.fr, getN),
sapply(mergers.rf, getN),
rowSums(seqtab.nochim),
rowSums(seqtab.nochim2)
)
colnames(track) <- c(
"Demux",
"Clipped_fr",
"Clipped_rf",
"Filtered_fr",
"Filtered_rf",
"Denoised_fwd_fr",
"Denoised_rev_fr",
"Denoised_fwd_rf",
"Denoised_rev_rf",
"Merged_fr",
"Merged_rf",
"Nochim",
"Tabled"
)
if(opt$orientation == "mixed") {
track <- cbind(
nSeq$Demux,
filt.out.fr[, 1],
filt.out.rf[, 1],
filt.out.fr[, 2],
filt.out.rf[, 2],
sapply(dadaFs.fr, getN),
sapply(dadaRs.fr, getN),
sapply(dadaFs.rf, getN),
sapply(dadaRs.rf, getN),
sapply(mergers.fr, getN),
sapply(mergers.rf, getN),
rowSums(seqtab.nochim),
rowSums(seqtab.nochim2)
)
colnames(track) <- c(
"Demux",
"Clipped_fr",
"Clipped_rf",
"Filtered_fr",
"Filtered_rf",
"Denoised_fwd_fr",
"Denoised_rev_fr",
"Denoised_fwd_rf",
"Denoised_rev_rf",
"Merged_fr",
"Merged_rf",
"Nochim",
"Tabled"
)
} else {
track <- cbind(
nSeq$Demux,
filt.out.fr[, 1],
filt.out.fr[, 2],
sapply(dadaFs.fr, getN),
sapply(dadaRs.fr, getN),
sapply(mergers.fr, getN),
rowSums(seqtab.nochim),
rowSums(seqtab.nochim2)
)
colnames(track) <- c(
"Demux",
"Clipped_fr",
"Filtered_fr",
"Denoised_fwd_fr",
"Denoised_rev_fr",
"Merged_fr",
"Nochim",
"Tabled"
)
}
rownames(track) <- c(sample.names)
track <- data.frame(track)

120
Amplicon_dada2_MiSeq/scripts/trimming.R

@ -117,6 +117,13 @@ option_list <- list(
help = "maximum expected error rate (upper bound)",
metavar = "number"
),
make_option(
c("-o", "--orientation"),
type = "character",
default = "fr",
help = "specify if libraries were generated in mixed orientation",
metavar = "character"
),
make_option(
c("-d", "--dir"),
type = "character",
@ -276,22 +283,34 @@ msg(paste0("There are ", length(sample.names), " samples in the data set.\n"))
path <- paste0(opt$dir, "/Validated_data/", opt$analysis)
fnFs.fr <- paste0(path, "/", sample.names, "_clip_fr_R1.fastq.gz")
fnRs.fr <- paste0(path, "/", sample.names, "_clip_fr_R2.fastq.gz")
fnFs.rf <- paste0(path, "/", sample.names, "_clip_rf_R1.fastq.gz")
fnRs.rf <- paste0(path, "/", sample.names, "_clip_rf_R2.fastq.gz")
names(fnFs.fr) <- sample.names
names(fnRs.fr) <- sample.names
names(fnFs.rf) <- sample.names
names(fnRs.rf) <- sample.names
if(opt$orientation == "mixed") {
fnFs.rf <- paste0(path, "/", sample.names, "_clip_rf_R1.fastq.gz")
fnRs.rf <- paste0(path, "/", sample.names, "_clip_rf_R2.fastq.gz")
names(fnFs.rf) <- sample.names
names(fnRs.rf) <- sample.names
}
### Plot quality profiles ####
quality_check(
c(fnFs.fr, fnFs.rf),
c(fnRs.fr, fnRs.rf),
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfile_", opt$name),
cpus = opt$cpus
)
if(opt$orientation == "mixed") {
quality_check(
c(fnFs.fr, fnFs.rf),
c(fnRs.fr, fnRs.rf),
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfile_", opt$name),
cpus = opt$cpus
)
} else {
quality_check(
fnFs.fr,
fnRs.fr,
file_base = paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/QualityProfile_", opt$name),
cpus = opt$cpus
)
}
### Screen filtering parameters ####
@ -341,14 +360,16 @@ screen_filt_settings_fr <- screen_settings(
range_truncLen,
cpus = opt$cpus
)
screen_filt_settings_rf <- screen_settings(
sample.subset,
fnFs.rf[sample.subset],
fnRs.rf[sample.subset],
range_maxEE,
range_truncLen,
cpus = opt$cpus
)
if(opt$orientation == "mixed") {
screen_filt_settings_rf <- screen_settings(
sample.subset,
fnFs.rf[sample.subset],
fnRs.rf[sample.subset],
range_maxEE,
range_truncLen,
cpus = opt$cpus
)
}
# plot screening output
# This is just a gut feeling, but I would optimize for the following criteria:
@ -368,32 +389,47 @@ text(
pos = 2,
col = adjustcolor("black", alpha.f = 0.5)
)
plot(
screen_filt_settings_rf[, "prop_total"],
screen_filt_settings_rf[, "q90"] - screen_filt_settings_rf[, "q10"],
col = rep(rainbow(nrow(range_maxEE)), nrow(range_truncLen) - 1),
pch = 16
)
text(
screen_filt_settings_rf[, "prop_total"],
screen_filt_settings_rf[, "q90"] - screen_filt_settings_rf[, "q10"],
pos = 2,
col = adjustcolor("black", alpha.f = 0.5)
)
if(opt$orientation == "mixed") {
plot(
screen_filt_settings_rf[, "prop_total"],
screen_filt_settings_rf[, "q90"] - screen_filt_settings_rf[, "q10"],
col = rep(rainbow(nrow(range_maxEE)), nrow(range_truncLen) - 1),
pch = 16
)
text(
screen_filt_settings_rf[, "prop_total"],
screen_filt_settings_rf[, "q90"] - screen_filt_settings_rf[, "q10"],
pos = 2,
col = adjustcolor("black", alpha.f = 0.5)
)
}
dev.off()
# write screening output to file
msg("Writing output.\n")
write.table(
data.frame(
index = c(1:nrow(screen_filt_settings_fr), 1:nrow(screen_filt_settings_rf)),
orientation = c(rep("fr", nrow(screen_filt_settings_fr)), rep("rf", nrow(screen_filt_settings_rf))),
rbind(screen_filt_settings_fr, screen_filt_settings_rf)
),
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/Parameter_screening_", opt$name, ".txt"),
sep = "\t",
quote = F,
row.names = F
)
if(opt$orientation == "mixed") {
write.table(
data.frame(
index = c(1:nrow(screen_filt_settings_fr), 1:nrow(screen_filt_settings_rf)),
orientation = c(rep("fr", nrow(screen_filt_settings_fr)), rep("rf", nrow(screen_filt_settings_rf))),
rbind(screen_filt_settings_fr, screen_filt_settings_rf)
),
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/Parameter_screening_", opt$name, ".txt"),
sep = "\t",
quote = F,
row.names = F
)
} else {
write.table(
data.frame(
index = 1:nrow(screen_filt_settings_fr),
orientation = rep("fr", nrow(screen_filt_settings_fr)),
screen_filt_settings_fr
),
paste0(opt$dir, "/Intermediate_results/", opt$analysis, "/Parameter_screening_", opt$name, ".txt"),
sep = "\t",
quote = F,
row.names = F
)
}

Loading…
Cancel
Save