From 66d4a68732413cd95f56ca16262b3c008d5a304a Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 30 Apr 2021 14:23:06 +0200
Subject: [PATCH 01/70] Add bcftools stats modules
---
.github/workflows/ci.yml | 3 +
CHANGELOG.md | 1 +
assets/multiqc_config.yaml | 2 +
docs/images/nf-core_eager_logo_small.svg | 503 +++++++++++++++++++++++
environment.yml | 2 +-
main.nf | 50 ++-
nextflow.config | 1 +
nextflow_schema.json | 7 +
8 files changed, 556 insertions(+), 13 deletions(-)
create mode 100644 docs/images/nf-core_eager_logo_small.svg
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 442e3c4c8..10abe9f80 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -135,6 +135,9 @@ jobs:
- name: GENOTYPING_ANGSD Test running ANGSD genotype likelihood calculation
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --run_genotyping --genotyping_tool 'angsd'
+ - name: GENOTYPING_BCFTOOLS Test running FreeBayes with bcftools stats turned on
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'freebayes' --run_bcftools_stats
- name: SKIPPING Test checking all skip steps work i.e. input bam, skipping straight to genotyping
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_genotyping --genotyping_tool 'freebayes'
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3bc8ae070..2657eacf8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
- [#729](https://github.com/nf-core/eager/issues/729) Added Bowtie2 flag `--maxins` for PE mapping modern DNA mapping contexts
+- [#317](https://github.com/nf-core/eager/issues/317) Added bcftools stats for general genotyping statistics of VCF files
### `Fixed`
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index 0d8c7c28a..d7e962a5d 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -25,6 +25,7 @@ run_modules:
- samtools
- sexdeterrmine
- hops
+ - bcftools
extra_fn_clean_exts:
- '_fastp'
@@ -91,6 +92,7 @@ top_modules:
- 'mtnucratio'
- 'qualimap'
- 'sexdeterrmine'
+ - 'bcftools'
- 'multivcfanalyzer':
path_filters:
- '*MultiVCFAnalyzer.json'
diff --git a/docs/images/nf-core_eager_logo_small.svg b/docs/images/nf-core_eager_logo_small.svg
new file mode 100644
index 000000000..9e756f4e7
--- /dev/null
+++ b/docs/images/nf-core_eager_logo_small.svg
@@ -0,0 +1,503 @@
+
+
diff --git a/environment.yml b/environment.yml
index 893175dce..d42c7295e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -49,4 +49,4 @@ dependencies:
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.0
- bioconda::bbmap=38.87
-
+ - bioconda::bcftools=1.12.1
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 274e96fad..c96d683ec 100644
--- a/main.nf
+++ b/main.nf
@@ -179,7 +179,7 @@ if("${params.fasta}".endsWith(".gz")){
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
output:
- path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling
+ path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats
script:
unzip = zipped_fasta.toString() - '.gz'
@@ -190,7 +190,7 @@ if("${params.fasta}".endsWith(".gz")){
} else {
fasta_for_indexing = Channel
.fromPath("${params.fasta}", checkIfExists: true)
- .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling }
+ .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats }
}
// Check that fasta index file path ends in '.fai'
@@ -2230,7 +2230,7 @@ process genotyping_ug {
file dict from ch_dict_for_ug.collect()
output:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*vcf.gz") into ch_ug_for_multivcfanalyzer,ch_ug_for_vcf2genome
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*vcf.gz") into ch_ug_for_multivcfanalyzer,ch_ug_for_vcf2genome,ch_ug_for_bcftools_stats
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.realign.{bam,bai}") optional true
script:
@@ -2245,7 +2245,7 @@ process genotyping_ug {
$keep_realign
- pigz -p ${task.cpus} ${samplename}.unifiedgenotyper.vcf
+ bgzip -@ ${task.cpus} ${samplename}.unifiedgenotyper.vcf
"""
else if (params.gatk_dbsnp != '')
"""
@@ -2256,7 +2256,7 @@ process genotyping_ug {
$keep_realign
- pigz -p ${task.cpus} ${samplename}.unifiedgenotyper.vcf
+ bgzip -@ ${task.cpus} ${samplename}.unifiedgenotyper.vcf
"""
}
@@ -2271,7 +2271,7 @@ process genotyping_hc {
params.run_genotyping && params.genotyping_tool == 'hc'
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_hc
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_hc,ch_hc_for_bcftools_stats
file fasta from ch_fasta_for_genotyping_hc.collect()
file fai from ch_fai_for_hc.collect()
file dict from ch_dict_for_hc.collect()
@@ -2283,13 +2283,13 @@ process genotyping_hc {
if (params.gatk_dbsnp == '')
"""
gatk HaplotypeCaller -R ${fasta} -I ${bam} -O ${samplename}.haplotypecaller.vcf -stand-call-conf ${params.gatk_call_conf} --sample-ploidy ${params.gatk_ploidy} --output-mode ${params.gatk_hc_out_mode} --emit-ref-confidence ${params.gatk_hc_emitrefconf}
- pigz -p ${task.cpus} ${samplename}.haplotypecaller.vcf
+ bgzip -@ ${task.cpus} ${samplename}.haplotypecaller.vcf
"""
else if (params.gatk_dbsnp != '')
"""
gatk HaplotypeCaller -R ${fasta} -I ${bam} -O ${samplename}.haplotypecaller.vcf --dbsnp ${params.gatk_dbsnp} -stand-call-conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} --output_mode ${params.gatk_hc_out_mode} --emit-ref-confidence ${params.gatk_hc_emitrefconf}
- pigz -p ${task.cpus} ${samplename}.haplotypecaller.vcf
+ bgzip -@ ${task.cpus} ${samplename}.haplotypecaller.vcf
"""
}
@@ -2304,7 +2304,7 @@ process genotyping_freebayes {
params.run_genotyping && params.genotyping_tool == 'freebayes'
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_freebayes
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_freebayes,ch_fb_for_bcftools_stats
file fasta from ch_fasta_for_genotyping_freebayes.collect()
file fai from ch_fai_for_freebayes.collect()
file dict from ch_dict_for_freebayes.collect()
@@ -2316,7 +2316,7 @@ process genotyping_freebayes {
def skip_coverage = "${params.freebayes_g}" == 0 ? "" : "-g ${params.freebayes_g}"
"""
freebayes -f ${fasta} -p ${params.freebayes_p} -C ${params.freebayes_C} ${skip_coverage} ${bam} > ${samplename}.freebayes.vcf
- pigz -p ${task.cpus} ${samplename}.freebayes.vcf
+ bgzip -@ ${task.cpus} ${samplename}.freebayes.vcf
"""
}
@@ -2448,6 +2448,31 @@ process genotyping_angsd {
"""
}
+////////////////////////////////////
+/* -- GENOTYPING STATS -- */
+////////////////////////////////////
+
+process bcftools_stats {
+ label 'mc_small'
+ tag "${samplename}"
+ publishDir "${params.outdir}/bcftools/stats", mode: params.publish_dir_mode
+
+ when:
+ params.run_bcftools_stats
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(vcf) from ch_ug_for_vcf2genome.mix(ch_hc_for_bcftools_stats,ch_fb_for_bcftools_stats)
+ file fasta from ch_fasta_for_bcftools_stats.collect()
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.vcf.stats") into ch_bcftools_stats_for_multiqc
+
+ script:
+ """
+ bcftools stats *.vcf.gz -F ${fasta} > ${samplename}.vcf.stats
+ """
+}
+
////////////////////////////////////
/* -- CONSENSUS CALLING -- */
////////////////////////////////////
@@ -2473,7 +2498,7 @@ process vcf2genome {
def out = "${params.vcf2genome_outfile}" == '' ? "${samplename}.fasta" : "${params.vcf2genome_outfile}"
def fasta_head = "${params.vcf2genome_header}" == '' ? "${samplename}" : "${params.vcf2genome_header}"
"""
- pigz -f -d -p ${task.cpus} *.vcf.gz
+ bgzip -f -d -@ ${task.cpus} *.vcf.gz
vcf2genome -Xmx${task.memory.toGiga()}g -draft ${out}.fasta -draftname "${fasta_head}" -in ${vcf.baseName} -minc ${params.vcf2genome_minc} -minfreq ${params.vcf2genome_minfreq} -minq ${params.vcf2genome_minq} -ref ${fasta} -refMod ${out}_refmod.fasta -uncertain ${out}_uncertainy.fasta
pigz -p ${task.cpus} *.fasta
pigz -p ${task.cpus} *.vcf
@@ -2516,7 +2541,7 @@ process multivcfanalyzer {
script:
def write_freqs = params.write_allele_frequencies ? "T" : "F"
"""
- gunzip -f *.vcf.gz
+ bgzip -f -d -@ ${task.cpus} *.vcf.gz
multivcfanalyzer -Xmx${task.memory.toGiga()}g ${params.snp_eff_results} ${fasta} ${params.reference_gff_annotations} . ${write_freqs} ${params.min_genotype_quality} ${params.min_base_coverage} ${params.min_allele_freq_hom} ${params.min_allele_freq_het} ${params.reference_gff_exclude} *.vcf
pigz -p ${task.cpus} *.tsv *.txt snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta fullAlignment.fasta
"""
@@ -2998,6 +3023,7 @@ process multiqc {
file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([])
file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([])
file ('genotyping/*') from ch_eigenstrat_snp_cov_for_multiqc.collect().ifEmpty([])
+ file ('bcftools_stats') from ch_bcftools_stats_for_multiqc.collect().ifEmpty([])
file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")
output:
diff --git a/nextflow.config b/nextflow.config
index 72127c379..fecdbd504 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -167,6 +167,7 @@ params {
angsd_glformat = 'binary'
angsd_createfasta = false
angsd_fastamethod = 'random'
+ run_bcftools_stats = test_stresstest_human
//Consensus sequence generation
run_vcf2genome = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 26a2fbf0f..14d6acbd1 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1147,6 +1147,13 @@
"random",
"common"
]
+ },
+ "run_bcftools_stats": {
+ "type": "boolean",
+ "default": true,
+ "description": "Turn on bcftools stats generation for VCF based variant calling statistics",
+ "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.",
+ "fa_icon": "far fa-chart-bar"
}
},
"fa_icon": "fas fa-sliders-h",
From 51f5790c63a325c99a53334a0eb54a75760acd2c Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 3 May 2021 10:08:53 +0200
Subject: [PATCH 02/70] Add bcftools - still missing output docs
---
environment.yml | 2 +-
main.nf | 28 ++++++++++++++++++++++------
nextflow.config | 2 +-
3 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/environment.yml b/environment.yml
index d42c7295e..739f18e6d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -49,4 +49,4 @@ dependencies:
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.0
- bioconda::bbmap=38.87
- - bioconda::bcftools=1.12.1
\ No newline at end of file
+ - bioconda::bcftools=1.9
diff --git a/main.nf b/main.nf
index c96d683ec..20d82dceb 100644
--- a/main.nf
+++ b/main.nf
@@ -190,7 +190,7 @@ if("${params.fasta}".endsWith(".gz")){
} else {
fasta_for_indexing = Channel
.fromPath("${params.fasta}", checkIfExists: true)
- .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats }
+ .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling;ch_fasta_for_bcftools_stats }
}
// Check that fasta index file path ends in '.fai'
@@ -2213,8 +2213,24 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
}
+
+
// Unified Genotyper - although not-supported, better for aDNA (because HC does de novo assembly which requires higher coverages), and needed for MultiVCFAnalyzer
+// initialise empty bcftool related empty channels
+
+if ( params.genotyping_tool == 'ug' ) {
+ ch_hc_for_bcftools_stats = Channel.empty()
+ ch_fb_for_bcftools_stats = Channel.empty()
+} else if ( params.genotyping_tool == 'hc' ) {
+ ch_ug_for_bcftools_stats = Channel.empty()
+ ch_fb_for_bcftools_stats = Channel.empty()
+} else if ( params.genotyping_tool == 'fb ') {
+ ch_ug_for_bcftools_stats = Channel.empty()
+ ch_hc_for_bcftools_stats = Channel.empty()
+}
+
+
process genotyping_ug {
label 'mc_small'
tag "${samplename}"
@@ -2271,13 +2287,13 @@ process genotyping_hc {
params.run_genotyping && params.genotyping_tool == 'hc'
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_hc,ch_hc_for_bcftools_stats
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_hc
file fasta from ch_fasta_for_genotyping_hc.collect()
file fai from ch_fai_for_hc.collect()
file dict from ch_dict_for_hc.collect()
output:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*vcf.gz")
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*vcf.gz") into ch_hc_for_bcftools_stats
script:
if (params.gatk_dbsnp == '')
@@ -2304,13 +2320,13 @@ process genotyping_freebayes {
params.run_genotyping && params.genotyping_tool == 'freebayes'
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_freebayes,ch_fb_for_bcftools_stats
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_damagemanipulation_for_genotyping_freebayes
file fasta from ch_fasta_for_genotyping_freebayes.collect()
file fai from ch_fai_for_freebayes.collect()
file dict from ch_dict_for_freebayes.collect()
output:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*vcf.gz")
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*vcf.gz") into ch_fb_for_bcftools_stats
script:
def skip_coverage = "${params.freebayes_g}" == 0 ? "" : "-g ${params.freebayes_g}"
@@ -2461,7 +2477,7 @@ process bcftools_stats {
params.run_bcftools_stats
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(vcf) from ch_ug_for_vcf2genome.mix(ch_hc_for_bcftools_stats,ch_fb_for_bcftools_stats)
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(vcf) from ch_ug_for_bcftools_stats.mix(ch_hc_for_bcftools_stats,ch_fb_for_bcftools_stats)
file fasta from ch_fasta_for_bcftools_stats.collect()
output:
diff --git a/nextflow.config b/nextflow.config
index fecdbd504..f4fc14a0f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -167,7 +167,7 @@ params {
angsd_glformat = 'binary'
angsd_createfasta = false
angsd_fastamethod = 'random'
- run_bcftools_stats = test_stresstest_human
+ run_bcftools_stats = true
//Consensus sequence generation
run_vcf2genome = false
From 56a50bc6daa3f1bf0bd70e90e77fe1d7a2677eeb Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 3 May 2021 11:19:54 +0200
Subject: [PATCH 03/70] Remove unnecessary branching
---
main.nf | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/main.nf b/main.nf
index 20d82dceb..b80230641 100644
--- a/main.nf
+++ b/main.nf
@@ -2219,16 +2219,16 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
// initialise empty bcftool related empty channels
-if ( params.genotyping_tool == 'ug' ) {
- ch_hc_for_bcftools_stats = Channel.empty()
- ch_fb_for_bcftools_stats = Channel.empty()
-} else if ( params.genotyping_tool == 'hc' ) {
- ch_ug_for_bcftools_stats = Channel.empty()
- ch_fb_for_bcftools_stats = Channel.empty()
-} else if ( params.genotyping_tool == 'fb ') {
- ch_ug_for_bcftools_stats = Channel.empty()
- ch_hc_for_bcftools_stats = Channel.empty()
-}
+//if ( params.genotyping_tool == 'ug' ) {
+// ch_hc_for_bcftools_stats = Channel.empty()
+// ch_fb_for_bcftools_stats = Channel.empty()
+//} else if ( params.genotyping_tool == 'hc' ) {
+// ch_ug_for_bcftools_stats = Channel.empty()
+// ch_fb_for_bcftools_stats = Channel.empty()
+//} else if ( params.genotyping_tool == 'fb ') {
+// ch_ug_for_bcftools_stats = Channel.empty()
+// ch_hc_for_bcftools_stats = Channel.empty()
+//}
process genotyping_ug {
From f9bc10a6e5bcc054a5d8791df4788e500ed6c0e6 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 5 May 2021 22:19:17 +0200
Subject: [PATCH 04/70] Add docs and fix compatibility with MVA and VCF2Genomes
---
docs/output.md | 26 ++++++++++++++++++++++++++
main.nf | 8 ++++----
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/docs/output.md b/docs/output.md
index cc07d9a69..7bd1084b3 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -618,6 +618,31 @@ If this correlation is not observed, your data is skewed towards higher coverage
+### Bcftools
+
+#### Background
+
+Bcftools is a toolkit for processing and summarising VCF (variant call format) files. nf-core/eager currently uses bcftools for its `stats` functionality, which writes a range of summary statistics about the VCF files produced by the GATK and FreeBayes variant callers to a text file.
+
+#### Variant Substitution Types
+
+This stack bar plot shows you the distribution of all types of point-mutation variants away from the reference nucleotide at each position, (e.g. A>C, A>G etc.).
+
+For low-coverage aDNA data that has been neither UDG-treated, trimmed, nor re-scaled, you expect to see C>T substitutions as the largest category, due to the most common ancient DNA damage being C to T deamination.
+
+#### Variant Quality
+
+This gives you the distribution of variant-call _qualities_ in your VCF files. Each variant will get given a 'Phred-scale' like value that represents the confidence of the variant caller that it has made the right call. The scale is very similar to that of base-call values in FASTQ files (as assessed by FastQC). Distributions that have peaks at higher variant quality scores (>= 30) suggest more confident variant calls. However, in cases of low-coverage aDNA data, these distributions may not be so good.
+
+More detailed explanation of variant quality scores can be seen in the Broad Institute's [GATK documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360035531872-Phred-scaled-quality-scores).
+
+#### Indel Distribution
+
+This plot shows you the distribution of the sizes of insertion- and deletions (InDels) in the variant calling (assuming you configured your variant caller parameters to do so). Low-coverage aDNA data often will not have high enough coverage to accurately assess InDels. In cases of high-coverage data of small-genomes such as microbes, large numbers of InDels, however, may indicate your reads are actually from a _relative_ of the reference mapped to - and should be verified downstream.
+#### Variant depths
+
+This plot shows the distribution of depth coverages of each variant called. Typically higher coverage will result in higher quality variant calls (see Variant Quality, above), however in many cases in aDNA these may be low and unequally distributed (due to uneven mapping coverage from contamination).
+
### MultiVCFAnalyzer
#### Background
@@ -674,3 +699,4 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
* `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
* `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
* `librarymerged_bams/`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
+* `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied.
\ No newline at end of file
diff --git a/main.nf b/main.nf
index b80230641..e8d9982a0 100644
--- a/main.nf
+++ b/main.nf
@@ -2514,9 +2514,9 @@ process vcf2genome {
def out = "${params.vcf2genome_outfile}" == '' ? "${samplename}.fasta" : "${params.vcf2genome_outfile}"
def fasta_head = "${params.vcf2genome_header}" == '' ? "${samplename}" : "${params.vcf2genome_header}"
"""
- bgzip -f -d -@ ${task.cpus} *.vcf.gz
- vcf2genome -Xmx${task.memory.toGiga()}g -draft ${out}.fasta -draftname "${fasta_head}" -in ${vcf.baseName} -minc ${params.vcf2genome_minc} -minfreq ${params.vcf2genome_minfreq} -minq ${params.vcf2genome_minq} -ref ${fasta} -refMod ${out}_refmod.fasta -uncertain ${out}_uncertainy.fasta
- pigz -p ${task.cpus} *.fasta
+ pigz -d -f -p ${task.cpus} ${vcf}
+ vcf2genome -Xmx${task.memory.toGiga()}g -draft ${out} -draftname "${fasta_head}" -in ${vcf.baseName} -minc ${params.vcf2genome_minc} -minfreq ${params.vcf2genome_minfreq} -minq ${params.vcf2genome_minq} -ref ${fasta} -refMod ${out}_refmod.fasta -uncertain ${out}_uncertainty.fasta
+ pigz -f -p ${task.cpus} ${out}*
pigz -p ${task.cpus} *.vcf
"""
}
@@ -2557,7 +2557,7 @@ process multivcfanalyzer {
script:
def write_freqs = params.write_allele_frequencies ? "T" : "F"
"""
- bgzip -f -d -@ ${task.cpus} *.vcf.gz
+ pigz -d -f -p ${task.cpus} ${vcf}
multivcfanalyzer -Xmx${task.memory.toGiga()}g ${params.snp_eff_results} ${fasta} ${params.reference_gff_annotations} . ${write_freqs} ${params.min_genotype_quality} ${params.min_base_coverage} ${params.min_allele_freq_hom} ${params.min_allele_freq_het} ${params.reference_gff_exclude} *.vcf
pigz -p ${task.cpus} *.tsv *.txt snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta fullAlignment.fasta
"""
From e0fae4efe6c7541866749ff0cf47ff1028b7c6db Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 6 May 2021 08:37:41 +0200
Subject: [PATCH 05/70] Update output.md
---
docs/output.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/docs/output.md b/docs/output.md
index 7bd1084b3..b3030d6b1 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -639,6 +639,7 @@ More detailed explanation of variant quality scores can be seen in the Broad Ins
#### Indel Distribution
This plot shows you the distribution of the sizes of insertion- and deletions (InDels) in the variant calling (assuming you configured your variant caller parameters to do so). Low-coverage aDNA data often will not have high enough coverage to accurately assess InDels. In cases of high-coverage data of small-genomes such as microbes, large numbers of InDels, however, may indicate your reads are actually from a _relative_ of the reference mapped to - and should be verified downstream.
+
#### Variant depths
This plot shows the distribution of depth coverages of each variant called. Typically higher coverage will result in higher quality variant calls (see Variant Quality, above), however in many cases in aDNA these may be low and unequally distributed (due to uneven mapping coverage from contamination).
@@ -699,4 +700,4 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
* `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
* `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
* `librarymerged_bams/`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
-* `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied.
\ No newline at end of file
+* `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These include things such as the number of positions, number of transitions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied.
From 9032bf6acc315629ac549ea815ea7c94f04130dc Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 6 May 2021 20:58:59 +0200
Subject: [PATCH 06/70] Remove now unnecessary CONTRIBUTING step due to new magic
groovy functions
---
.github/CONTRIBUTING.md | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 3e4a4cfa2..75b61b9ff 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -70,14 +70,13 @@ If you wish to contribute a new step, please use the following coding standards:
3. Define the output channel if needed (see below).
4. Add any new flags/options to `nextflow.config` with a default (see below).
5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build .`).
-6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter).
-7. Add sanity checks for all relevant parameters.
-8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`.
-9. Do local tests that the new code works properly and as expected.
-10. Add a new test command in `.github/workflow/ci.yaml`.
-11. If applicable add a [MultiQC](https://https://multiqc.info/) module.
-12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order.
-13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`.
+6. Add sanity checks for all relevant parameters.
+7. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`.
+8. Do local tests that the new code works properly and as expected.
+9. Add a new test command in `.github/workflows/ci.yml`.
+10. If applicable add a [MultiQC](https://multiqc.info/) module.
+11. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order.
+12. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`.
### Default values
From 3702a07008b820077549b3f673a934e2a65e45bb Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 6 May 2021 22:14:45 +0200
Subject: [PATCH 07/70] Software version reporting fixes and additions
---
bin/scrape_software_versions.py | 6 ++++--
main.nf | 6 ++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 5c9c0da9c..2e28d168e 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -16,7 +16,7 @@
'Bowtie2': ['v_bowtie2.txt', r"bowtie2-([0-9]+\.[0-9]+\.[0-9]+) -fdebug"],
'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"],
'GATK HaplotypeCaller': ['v_gatk.txt', r" v(\S+)"],
- #'GATK UnifiedGenotyper': ['v_gatk3_5.txt', r"version (\S+)"],
+ 'GATK UnifiedGenotyper': ['v_gatk3.txt', r"(\S+)"],
'bamUtil' : ['v_bamutil.txt', r"Version: (\S+);"],
'fastP': ['v_fastp.txt', r"([\d\.]+)"],
'DamageProfiler' : ['v_damageprofiler.txt', r"DamageProfiler v(\S+)"],
@@ -37,7 +37,8 @@
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
- 'bbduk':['v_bbduk.txt',r"(\S+)"]
+ 'bbduk':['v_bbduk.txt',r"(\S+\ .+)"],
+ 'bcftools':['v_bcftools.txt',r"(\S+)"]
}
results = OrderedDict()
@@ -75,6 +76,7 @@
results['eigenstrat_snp_coverage'] = 'N/A'
results['mapDamage2'] = 'N/A'
results['bbduk'] = 'N/A'
+results['bcftools'] = 'N/A'
# Search each file using its regex
for k, v in regexes.items():
diff --git a/main.nf b/main.nf
index e8d9982a0..426e003b9 100644
--- a/main.nf
+++ b/main.nf
@@ -2977,6 +2977,7 @@ process get_software_versions {
qualimap --version &> v_qualimap.txt 2>&1 || true
preseq &> v_preseq.txt 2>&1 || true
gatk --version 2>&1 | head -n 1 > v_gatk.txt 2>&1 || true
+ gatk3 --version 2>&1 | head -n 1 > v_gatk3.txt 2>&1 || true
freebayes --version &> v_freebayes.txt 2>&1 || true
bedtools --version &> v_bedtools.txt 2>&1 || true
damageprofiler --version &> v_damageprofiler.txt 2>&1 || true
@@ -2995,8 +2996,9 @@ process get_software_versions {
pileupCaller --version &> v_sequencetools.txt 2>&1 || true
bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
- mapDamage2 --version > v_mapdamage.txt || true
- bbduk.sh | grep 'Last modified' | cut -d' ' -f 3-99 > v_bbduk.txt || true
+ mapDamage --version > v_mapdamage.txt || true
+ bbduk.sh | grep 'Last modified' | cut -d ' ' -f 3-99 > v_bbduk.txt || true
+ bcftools --version | grep 'bcftools' | cut -d ' ' -f 2 > v_bcftools.txt || true
scrape_software_versions.py &> software_versions_mqc.yaml
"""
From a47f7442d21d9f8488fc21b8bac3709fe1419ab9 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 6 May 2021 22:16:16 +0200
Subject: [PATCH 08/70] Update changelog
---
CHANGELOG.md | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2657eacf8..3b155f559 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,15 +3,27 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## v2.3.4dev - [unreleased]
+## v2.4dev - [unreleased]
### `Added`
-- [#729](https://github.com/nf-core/eager/issues/729) Added Bowtie2 flag `--maxins` for PE mapping modern DNA mapping contexts
- [#317](https://github.com/nf-core/eager/issues/317) Added bcftools stats for general genotyping statistics of VCF files
### `Fixed`
+- Fixed some missing or incorrectly reported software versions
+
+### `Dependencies`
+
+### `Deprecated`
+
+## v2.3.4dev - [unreleased]
+
+### `Added`
+
+- [#729](https://github.com/nf-core/eager/issues/729) Added Bowtie2 flag `--maxins` for PE mapping modern DNA mapping contexts
+### `Fixed`
+
- Corrected explanation of the "--min_adap_overlap" parameter for AdapterRemoval in the docs
- [#725](https://github.com/nf-core/eager/pull/725) `bwa_index` doc update
- Re-adds gzip piping to AdapterRemovalFixPrefix to speed up process after reports of being very slow
From d92bcf20640612f7b55da8c5808ac62735d7c191 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 7 May 2021 14:39:02 +0200
Subject: [PATCH 09/70] Update CHANGELOG.md
---
CHANGELOG.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b155f559..94743a677 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
- [#729](https://github.com/nf-core/eager/issues/729) Added Bowtie2 flag `--maxins` for PE mapping modern DNA mapping contexts
+
### `Fixed`
- Corrected explanation of the "--min_adap_overlap" parameter for AdapterRemoval in the docs
From a48953e59b0b3b48d58046b850cc4c2d4395b9f9 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 21 May 2021 15:51:16 +0200
Subject: [PATCH 10/70] Add basic functionality for barcode trimming -
---
README.md | 3 +-
assets/multiqc_config.yaml | 12 ++---
docs/output.md | 6 ++-
main.nf | 93 ++++++++++++++++++++++++++++++++++++--
nextflow.config | 5 ++
nextflow_schema.json | 36 ++++++++++++---
6 files changed, 134 insertions(+), 21 deletions(-)
diff --git a/README.md b/README.md
index 5997625c6..2e28463d2 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ By default the pipeline currently performs the following:
* Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`)
* Sequencing quality control (`FastQC`)
-* Sequencing adapter removal and for paired end data merging (`AdapterRemoval`)
+* Sequencing adapter removal and paired-end data merging (`AdapterRemoval`)
* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`)
* Post-mapping processing, statistics and conversion to bam (`samtools`)
* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`)
@@ -85,6 +85,7 @@ Additional functionality contained by the pipeline currently includes:
#### Preprocessing
* Illumina two-coloured sequencer poly-G tail removal (`fastp`)
+* Post-AdapterRemoval trimming of FASTQ files prior to mapping (`fastp`)
* Automatic conversion of unmapped reads to FASTQ (`samtools`)
* Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples)
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index 0d8c7c28a..7bab487a8 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -60,13 +60,13 @@ extra_fn_clean_exts:
top_modules:
- 'fastqc':
- name: 'FastQC (pre-AdapterRemoval)'
+ name: 'FastQC (pre-Trimming)'
path_filters:
- '*_raw_fastqc.zip'
- 'fastp'
- 'adapterRemoval'
- 'fastqc':
- name: 'FastQC (post-AdapterRemoval)'
+ name: 'FastQC (post-Trimming)'
path_filters:
- '*.truncated_fastqc.zip'
- '*.combined*_fastqc.zip'
@@ -106,7 +106,7 @@ remove_sections:
- sexdeterrmine-snps
table_columns_visible:
- FastQC (pre-AdapterRemoval):
+ FastQC (pre-Trimming):
percent_duplicates: False
percent_gc: True
avg_sequence_length: True
@@ -117,7 +117,7 @@ table_columns_visible:
Adapter Removal:
aligned_total: False
percent_aligned: True
- FastQC (post-AdapterRemoval):
+ FastQC (post-Trimming):
avg_sequence_length: True
percent_duplicates: False
total_sequences: True
@@ -180,7 +180,7 @@ table_columns_visible:
Total_Snps: False
table_columns_placement:
- FastQC (pre-AdapterRemoval):
+ FastQC (pre-Trimming):
total_sequences: 100
avg_sequence_length: 110
percent_gc: 120
@@ -188,7 +188,7 @@ table_columns_placement:
after_filtering_gc_content: 200
Adapter Removal:
percent_aligned: 300
- FastQC (post-AdapterRemoval):
+ FastQC (post-Trimming):
total_sequences: 400
avg_sequence_length: 410
percent_gc: 420
diff --git a/docs/output.md b/docs/output.md
index cc07d9a69..978b01f70 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -112,7 +112,8 @@ When dealing with ancient DNA data the MultiQC plots for FastQC will often show
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
-> **NB:** The FastQC (pre-AdapterRemoval) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-AdapterRemoval). You should expect after AdapterRemoval, that most of the artefacts are removed.
+> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed.
+> :warning: If you turned on `post_ar_fastq_trimming` your 'post-Trimming' report will _include_ reads that were additionally trimmed. There is no separate report for the post-AdapterRemoval trimming.
#### Sequence Counts
@@ -648,7 +649,8 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
* `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag.
* `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval.
-* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) FASTQ files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
+* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
+* `post_ar_fastq_trimmed/`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These are usually reads that had internal barcodes, or damage, that needed to be removed before mapping.
* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!).
* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
diff --git a/main.nf b/main.nf
index a45bc8943..2e9173d0d 100644
--- a/main.nf
+++ b/main.nf
@@ -932,14 +932,97 @@ if (!params.skip_adapterremoval) {
ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
.filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
.dump(tag: "AR Bypass")
- .into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; }
+ .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
} else {
ch_fastp_for_skipadapterremoval
- .into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; }
+ .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
}
+// Post AR fastq trimming
+
+process post_ar_fastq_trimming {
+ label 'mc_small'
+ tag "${libraryid}"
+ publishDir "${params.outdir}/post_ar_fastq_trimmed", mode: params.publish_dir_mode
+
+ when: params.run_post_ar_trimming
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_adapterremoval_for_post_ar_trimming
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_postartrimmed.fq.gz") into ch_post_ar_trimming_for_lanemerge_r1
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R2_postartrimmed.fq.gz") optional true into ch_post_ar_trimming_for_lanemerge_r2
+
+ script:
+ if ( seqtype == 'SE' | (seqtype == 'PE' && !params.skip_collapse) ) {
+ """
+ fastp --in1 ${r1} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_R1_postartrimmed.fq.gz
+ """
+ } else if ( seqtype == 'PE' && params.skip_collapse ) {
+ """
+ fastp --in1 ${r1} --in2 ${r2} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} --trim_front2 ${params.post_ar_trim_front2} --trim_tail2 ${params.post_ar_trim_tail2} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_R1_postartrimmed.fq.gz --out2 "${libraryid}"_R2_postartrimmed.fq.gz
+ """
+ }
+
+}
+
+// When not collapsing paired-end data, re-merge the R1 and R2 files into single map. Otherwise if SE or collapsed PE, R2 now becomes NA
+// Sort to make sure we get consistent R1 and R2 ordered when using `-resume`, even if not needed for FastQC
+if ( params.skip_collapse ){
+ ch_post_ar_trimming_for_lanemerge_r1
+ .mix(ch_post_ar_trimming_for_lanemerge_r2)
+ .groupTuple(by: [0,1,2,3,4,5,6])
+ .map{
+ it ->
+ def samplename = it[0]
+ def libraryid = it[1]
+ def lane = it[2]
+ def seqtype = it[3]
+ def organism = it[4]
+ def strandedness = it[5]
+ def udg = it[6]
+ def r1 = file(it[7].sort()[0])
+ def r2 = seqtype == "PE" ? file(it[7].sort()[1]) : file("$projectDir/assets/nf-core_eager_dummy.txt")
+
+ [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
+
+ }
+ .set { ch_post_ar_trimming_for_lanemerge; }
+} else {
+ ch_post_ar_trimming_for_lanemerge_r1
+ .map{
+ it ->
+ def samplename = it[0]
+ def libraryid = it[1]
+ def lane = it[2]
+ def seqtype = it[3]
+ def organism = it[4]
+ def strandedness = it[5]
+ def udg = it[6]
+ def r1 = file(it[7])
+ def r2 = file("$projectDir/assets/nf-core_eager_dummy.txt")
+
+ [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
+ }
+ .set { ch_post_ar_trimming_for_lanemerge; }
+}
+
+
+// Inline barcode removal bypass when not running it
+if (params.run_post_ar_trimming) {
+ ch_post_ar_trimming_for_lanemerge.mix(ch_adapterremoval_for_skip_post_ar_trimming)
+ .dump(tag: "Inline Removal Bypass")
+ .into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
+} else {
+ ch_adapterremoval_for_skip_post_ar_trimming
+ .into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
+}
+
+
+
// Lane merging for libraries sequenced over multiple lanes (e.g. NextSeq)
-ch_branched_for_lanemerge = ch_adapterremoval_for_lanemerge
+ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
.groupTuple(by: [0,1,3,4,5,6])
.map {
it ->
@@ -1100,7 +1183,7 @@ process lanemerge_hostremoval_fastq {
}
-// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts
+// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts. If post-AR trimming was run, the reads checked here include that trimming step.
process fastqc_after_clipping {
label 'mc_small'
@@ -1114,7 +1197,7 @@ process fastqc_after_clipping {
when: !params.skip_adapterremoval && !params.skip_fastqc
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_adapterremoval_for_fastqc_after_clipping
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_inlinebarcoderemoval_for_fastqc_after_clipping
output:
path("*_fastqc.{zip,html}") into ch_fastqc_after_clipping
diff --git a/nextflow.config b/nextflow.config
index 3e85581f4..bda8ba8f5 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -74,6 +74,11 @@ params {
preserve5p = false
mergedonly = false
qualitymax = 41
+ run_post_ar_trimming = false
+ post_ar_trim_front = 7
+ post_ar_trim_tail = 7
+ post_ar_trim_front2 = 7
+ post_ar_trim_tail2 = 7
//Mapping algorithm
mapper = 'bwaaln'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 64814061c..23b12ab51 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -516,6 +516,35 @@
"help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`",
"default": 41,
"fa_icon": "fas fa-arrow-up"
+ },
+ "run_post_ar_trimming": {
+ "type": "boolean",
+ "description": "Turn on trimming of inline barcodes (i.e. internal barcodes after adapter removal)",
+ "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior to ligation of adapters and indices (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n"
+ },
+ "post_ar_trim_front": {
+ "type": "integer",
+ "default": 7,
+ "description": "Specify the number of bases to trim off the front of a merged read or R1",
+ "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`"
+ },
+ "post_ar_trim_tail": {
+ "type": "integer",
+ "default": 7,
+ "description": "Specify the number of bases to trim off the tail of a merged read or R1",
+ "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`"
+ },
+ "post_ar_trim_front2": {
+ "type": "integer",
+ "default": 7,
+ "description": "Specify the number of bases to trim off the front of R2",
+ "help_text": "Specify the number of bases to trim off the start of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`"
+ },
+ "post_ar_trim_tail2": {
+ "type": "integer",
+ "default": 7,
+ "description": "Specify the number of bases to trim off the tail of R2",
+ "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`"
}
},
"fa_icon": "fas fa-cut",
@@ -616,7 +645,6 @@
},
"bt2n": {
"type": "integer",
- "default": 0,
"description": "Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity.",
"fa_icon": "fas fa-sort-numeric-down",
"help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`",
@@ -627,21 +655,18 @@
},
"bt2l": {
"type": "integer",
- "default": 0,
"description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.",
"fa_icon": "fas fa-ruler-horizontal",
"help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`"
},
"bt2_trim5": {
"type": "integer",
- "default": 0,
"description": "Specify number of bases to trim off from 5' (left) end of read before alignment.",
"fa_icon": "fas fa-cut",
"help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`"
},
"bt2_trim3": {
"type": "integer",
- "default": 0,
"description": "Specify number of bases to trim off from 3' (right) end of read before alignment.",
"fa_icon": "fas fa-cut",
"help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`"
@@ -699,14 +724,12 @@
},
"bam_mapping_quality_threshold": {
"type": "integer",
- "default": 0,
"description": "Minimum mapping quality for reads filter.",
"fa_icon": "fas fa-greater-than-equal",
"help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`"
},
"bam_filter_minreadlength": {
"type": "integer",
- "default": 0,
"fa_icon": "fas fa-ruler-horizontal",
"description": "Specify minimum read length to be kept after mapping.",
"help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`"
@@ -1071,7 +1094,6 @@
},
"freebayes_g": {
"type": "integer",
- "default": 0,
"description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.",
"fa_icon": "fab fa-think-peaks",
"help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`"
From 5ed9de6233c6dca561da2b2fa7d6d65b05fe5ab2 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 26 May 2021 09:23:59 +0200
Subject: [PATCH 11/70] Add CI tests
---
.github/workflows/ci.yml | 3 +++
docs/output.md | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 213c2ac69..1ab27185b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -102,6 +102,9 @@ jobs:
- name: ADAPTERREMOVAL Run the basic pipeline with preserve5p end and merged reads only options
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preserve5p --mergedonly
+ - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming
- name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
diff --git a/docs/output.md b/docs/output.md
index 978b01f70..c7b7c6d3a 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -113,7 +113,7 @@ When dealing with ancient DNA data the MultiQC plots for FastQC will often show
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed.
-> :warning: If you turned on `post_ar_fastq_trimming` your 'post-Trimming' report will _include_ reads that were additionally trimmed. There is no separate report for the post-AdapterRemoval trimming.
+> :warning: If you turned on `--run_post_ar_trimming`, your 'post-Trimming' report will show the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming.
#### Sequence Counts
From 582dcf508cdadfc163f29b5b0ce76cff42421bee Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Fri, 4 Jun 2021 09:58:39 +0200
Subject: [PATCH 12/70] Post release version bump
---
.github/workflows/ci.yml | 4 ++--
CHANGELOG.md | 10 ++++++++++
Dockerfile | 4 ++--
environment.yml | 4 ++--
nextflow.config | 4 ++--
5 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 153a4befd..213c2ac69 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,13 +37,13 @@ jobs:
- name: Build new docker image
if: env.MATCHED_FILES
- run: docker build --no-cache . -t nfcore/eager:2.3.5
+ run: docker build --no-cache . -t nfcore/eager:dev
- name: Pull docker image
if: ${{ !env.MATCHED_FILES }}
run: |
docker pull nfcore/eager:dev
- docker tag nfcore/eager:dev nfcore/eager:2.3.5
+ docker tag nfcore/eager:dev nfcore/eager:dev
- name: Install Nextflow
env:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 254ce499f..c4f09342c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,16 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## v2.3.6dev - [unreleased]
+
+### `Added`
+
+### `Fixed`
+
+### `Dependencies`
+
+### `Deprecated`
+
## v2.3.5 - 2021-06-03
### `Added`
diff --git a/Dockerfile b/Dockerfile
index fc295bd15..441ad6698 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ COPY environment.yml /
RUN conda env create --quiet -f /environment.yml && conda clean -a
# Add conda installation dir to PATH (instead of doing 'conda activate')
-ENV PATH /opt/conda/envs/nf-core-eager-2.3.5/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-eager-2.3.6dev/bin:$PATH
# Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-eager-2.3.5 > nf-core-eager-2.3.5.yml
\ No newline at end of file
+RUN conda env export --name nf-core-eager-2.3.6dev > nf-core-eager-2.3.6dev.yml
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index f752203a6..81320ede4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
-name: nf-core-eager-2.3.5
+name: nf-core-eager-2.3.6dev
channels:
- conda-forge
- bioconda
@@ -48,4 +48,4 @@ dependencies:
- bioconda::bowtie2=2.4.2
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.0
- - bioconda::bbmap=38.87
+ - bioconda::bbmap=38.87
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index db0a875d2..737d94fa3 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -266,7 +266,7 @@ params {
// Container slug. Stable releases should specify release tag!
// Developmental code should specify :dev
-process.container = 'nfcore/eager:2.3.5'
+process.container = 'nfcore/eager:dev'
// Load base.config by default for all pipelines
includeConfig 'conf/base.config'
@@ -396,7 +396,7 @@ manifest {
description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline'
mainScript = 'main.nf'
nextflowVersion = '>=20.07.1'
- version = '2.3.5'
+ version = '2.3.6dev'
}
// Function to ensure that resource requirements don't go beyond
From 831cb5e689d39d7461f46c57f3f85815e9c07b63 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 4 Jun 2021 10:01:39 +0200
Subject: [PATCH 13/70] Markdown linking
---
CHANGELOG.md | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4f09342c..b2f082123 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,15 +3,15 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## v2.3.6dev - [unreleased]
-
-### `Added`
-
-### `Fixed`
-
-### `Dependencies`
-
-### `Deprecated`
+## v2.3.6dev - [unreleased]
+
+### `Added`
+
+### `Fixed`
+
+### `Dependencies`
+
+### `Deprecated`
## v2.3.5 - 2021-06-03
From 0b5d95a8fcf014caf0523e131c6a02c88f02e06f Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 7 Jun 2021 14:48:21 +0200
Subject: [PATCH 14/70] Bump pipeline and tool versions
---
Dockerfile | 4 ++--
environment.yml | 47 +++++++++++++++++++++++------------------------
nextflow.config | 4 ++--
3 files changed, 27 insertions(+), 28 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index 005098d7e..897d4deae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,7 +7,7 @@ COPY environment.yml /
RUN conda env create --quiet -f /environment.yml && conda clean -a
# Add conda installation dir to PATH (instead of doing 'conda activate')
-ENV PATH /opt/conda/envs/nf-core-eager-2.3.5dev/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-eager-2.4dev/bin:$PATH
# Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-eager-2.3.5dev > nf-core-eager-2.3.5dev.yml
+RUN conda env export --name nf-core-eager-2.4dev > nf-core-eager-2.4dev.yml
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 0b147d356..127be355e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,52 +1,51 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
-name: nf-core-eager-2.3.5dev
+name: nf-core-eager-2.4dev
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- - conda-forge::python=3.7.3
- - conda-forge::markdown=3.2.2 #Don't upgrade anymore
- - conda-forge::pymdown-extensions=7.1
- - conda-forge::pygments=2.6.1
+ - conda-forge::python=3.9.4
+ - conda-forge::markdown=3.3.4
+ - conda-forge::pymdown-extensions=8.2
+ - conda-forge::pygments=2.9.0
- bioconda::rename=1.601
- conda-forge::openjdk=8.0.144 # Don't upgrade - required for GATK
- bioconda::fastqc=0.11.9
- - bioconda::adapterremoval=2.3.1
+ - bioconda::adapterremoval=2.3.2
- bioconda::adapterremovalfixprefix=0.0.5
- bioconda::bwa=0.7.17
- - bioconda::picard=2.22.9
- - bioconda::samtools=1.9
+ - bioconda::picard=2.25.5
+ - bioconda::samtools=1.12
- bioconda::dedup=0.12.8
- - bioconda::angsd=0.933
+ - bioconda::angsd=0.935
- bioconda::circularmapper=1.93.5
- - bioconda::gatk4=4.1.7.0
+ - bioconda::gatk4=4.2.0.0
- bioconda::gatk=3.5 ## Don't upgrade - required for MultiVCFAnalyzer
- bioconda::qualimap=2.2.2d
- bioconda::vcf2genome=0.91
- bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8
- bioconda::multiqc=1.10.1
- bioconda::pmdtools=0.60
- - bioconda::bedtools=2.29.2
- - conda-forge::libiconv=1.15
+ - bioconda::bedtools=2.30.0
+ - conda-forge::libiconv=1.16
- conda-forge::pigz=2.6
- bioconda::sequencetools=1.4.0.6
- - bioconda::preseq=2.0.3
+ - bioconda::preseq=3.1.2
- bioconda::fastp=0.20.1
- - bioconda::bamutil=1.0.14
+ - bioconda::bamutil=1.0.15
- bioconda::mtnucratio=0.7
- - bioconda::pysam=0.15.4 #Says python3.7 or less
- - bioconda::kraken2=2.1.1
- - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
- - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
+ - bioconda::pysam=0.16.0 #Says python3.7 or less
+ - bioconda::kraken2=2.1.2
+ - conda-forge::pandas=1.2.4 #.4 is python3.8+ compatible
+ - bioconda::freebayes=1.3.5 #should be fine with python 3.8, but says <3.7 on webpage
- bioconda::sexdeterrmine=1.1.2
- bioconda::multivcfanalyzer=0.85.2
- bioconda::hops=0.35
- - conda-forge::biopython=1.76
- - conda-forge::xopen=0.9.0
- - bioconda::bowtie2=2.4.2
+ - conda-forge::biopython=1.79
+ - conda-forge::xopen=1.1.0
+ - bioconda::bowtie2=2.4.4
- bioconda::eigenstratdatabasetools=1.0.2
- - bioconda::mapdamage2=2.2.0
- - bioconda::bbmap=38.87
-
+ - bioconda::mapdamage2=2.2.1
+ - bioconda::bbmap=38.90
diff --git a/nextflow.config b/nextflow.config
index 3e85581f4..9f9b30fc8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -396,7 +396,7 @@ manifest {
description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline'
mainScript = 'main.nf'
nextflowVersion = '>=20.07.1'
- version = '2.3.5dev'
+ version = '2.4dev'
}
// Function to ensure that resource requirements don't go beyond
@@ -430,4 +430,4 @@ def check_max(obj, type) {
return obj
}
}
-}
+}
\ No newline at end of file
From bdfc5a19a94fb0abf6b0d9beca0c1f9981be788b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 7 Jun 2021 15:33:33 +0200
Subject: [PATCH 15/70] Add additional CI
---
.github/workflows/ci.yml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1ab27185b..45e921907 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -105,6 +105,9 @@ jobs:
- name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming
+ - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming, but skip adapterremoval
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming --skip_adapterremoval
- name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
From a9f18d391ea6eba888776cff478e2a769b934d56 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 7 Jun 2021 19:39:40 +0200
Subject: [PATCH 16/70] Bump bcftools version to latest
---
environment.yml | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/environment.yml b/environment.yml
index 0a3bbfca4..171d13d61 100644
--- a/environment.yml
+++ b/environment.yml
@@ -36,10 +36,10 @@ dependencies:
- bioconda::fastp=0.20.1
- bioconda::bamutil=1.0.15
- bioconda::mtnucratio=0.7
- - bioconda::pysam=0.16.0 #Says python3.7 or less
+ - bioconda::pysam=0.16.0
- bioconda::kraken2=2.1.2
- - conda-forge::pandas=1.2.4 #.4 is python3.8+ compatible
- - bioconda::freebayes=1.3.5 #should be fine with python 3.8, but says <3.7 on webpage
+ - conda-forge::pandas=1.2.4
+ - bioconda::freebayes=1.3.5
- bioconda::sexdeterrmine=1.1.2
- bioconda::multivcfanalyzer=0.85.2
- bioconda::hops=0.35
@@ -49,4 +49,4 @@ dependencies:
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.1
- bioconda::bbmap=38.90
- - bioconda::bcftools=1.9
+ - bioconda::bcftools=1.12
From ea7f9058c4ef9b09df19bcc0474aab7e4e1917c1 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 7 Jun 2021 20:32:35 +0200
Subject: [PATCH 17/70] Fix script
---
bin/scrape_software_versions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 0b41a10ec..ed7008381 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -37,7 +37,7 @@
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
- 'bbduk':['v_bbduk.txt',r"(.*)"]
+ 'bbduk':['v_bbduk.txt',r"(.*)"],
'bcftools':['v_bcftools.txt',r"(\S+)"]
}
From 231ae781bb750eb9d91f5173693f17b3ae9739a2 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 8 Jun 2021 09:50:26 +0200
Subject: [PATCH 18/70] Fix naming conflict when skip AR + running post_ar trim
---
main.nf | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/main.nf b/main.nf
index 2e9173d0d..6056a6fe8 100644
--- a/main.nf
+++ b/main.nf
@@ -931,7 +931,7 @@ if ( params.skip_collapse ){
if (!params.skip_adapterremoval) {
ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
.filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
- .dump(tag: "AR Bypass")
+ .dump(tag: "ar_bypass")
.into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
} else {
ch_fastp_for_skipadapterremoval
@@ -957,11 +957,11 @@ process post_ar_fastq_trimming {
script:
if ( seqtype == 'SE' | (seqtype == 'PE' && !params.skip_collapse) ) {
"""
- fastp --in1 ${r1} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_R1_postartrimmed.fq.gz
+ fastp --in1 ${r1} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz
"""
} else if ( seqtype == 'PE' && params.skip_collapse ) {
"""
- fastp --in1 ${r1} --in2 ${r2} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} --trim_front2 ${params.post_ar_trim_front2} --trim_tail2 ${params.post_ar_trim_tail2} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_R1_postartrimmed.fq.gz --out2 "${libraryid}"_R2_postartrimmed.fq.gz
+ fastp --in1 ${r1} --in2 ${r2} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} --trim_front2 ${params.post_ar_trim_front2} --trim_tail2 ${params.post_ar_trim_tail2} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz --out2 "${libraryid}"_L"${lane}"_R2_postartrimmed.fq.gz
"""
}
@@ -1012,7 +1012,7 @@ if ( params.skip_collapse ){
// Inline barcode removal bypass when not running it
if (params.run_post_ar_trimming) {
ch_post_ar_trimming_for_lanemerge.mix(ch_adapterremoval_for_skip_post_ar_trimming)
- .dump(tag: "Inline Removal Bypass")
+ .dump(tag: "inline_removal_bypass")
.into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
} else {
ch_adapterremoval_for_skip_post_ar_trimming
@@ -1039,7 +1039,7 @@ ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
}
- .dump(tag: "LaneMerge Bypass")
+ .dump(tag: "lanemerge_bypass_decision")
.branch {
skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
merge_me: it[7].size() > 1
@@ -1060,7 +1060,7 @@ ch_branched_for_lanemerge_skipme = ch_branched_for_lanemerge.skip_merge
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
}
- .dump(tag: "LaneMerge Reconfigure")
+ .dump(tag: "lanemerge_reconfigure")
ch_branched_for_lanemerge_ready = ch_branched_for_lanemerge.merge_me
@@ -1088,7 +1088,7 @@ process lanemerge {
publishDir "${params.outdir}/lanemerging", mode: params.publish_dir_mode
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready.dump(tag: "lange_merge_input")
output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_lanemerged.fq.gz") into ch_lanemerge_for_mapping_r1
@@ -1112,7 +1112,7 @@ process lanemerge {
// Ensuring always valid R2 file even if doesn't exist for AWS
if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
ch_lanemerge_for_mapping_r1
- .dump(tag: "Post LaneMerge Reconfigure")
+ .dump(tag: "post_lanemerge_reconfigure")
.mix(ch_lanemerge_for_mapping_r2)
.groupTuple(by: [0,1,2,3,4,5,6])
.map{
@@ -1227,7 +1227,7 @@ process bwa {
publishDir "${params.outdir}/mapping/bwa", mode: params.publish_dir_mode
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "input_tuple")
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "bwa_input_reads")
path index from bwa_index.collect().dump(tag: "input_index")
output:
@@ -1544,7 +1544,7 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging
[ samplename, libraryid, lane, seqtype_new, organism, strandedness, udg, r1, r2 ]
}
- .dump(tag: "Seqtype")
+ .dump(tag: "pre_seqtype_decision")
.branch {
skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
merge_me: it[7].size() > 1
@@ -1927,7 +1927,7 @@ process library_merge {
publishDir "${params.outdir}/merged_bams/initial", mode: params.publish_dir_mode
input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "Input Tuple Library Merge")
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "library_merge_input")
output:
tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, path("*_libmerged_rg_rmdup.bam"), path("*_libmerged_rg_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging
@@ -2435,7 +2435,7 @@ process genotyping_pileupcaller {
file fai from ch_fai_for_pileupcaller.collect()
file dict from ch_dict_for_pileupcaller.collect()
path(bed) from ch_bed_for_pileupcaller.collect()
- path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file")
+ path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "pileupcaller_snp_file")
output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
From 342ec92bf3971905004259c74c9f510237e05906 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 8 Jun 2021 10:29:26 +0200
Subject: [PATCH 19/70] Add AR adapterList removal
---
CHANGELOG.md | 11 +++++++++++
main.nf | 23 ++++++++++++++++-------
nextflow.config | 1 +
nextflow_schema.json | 16 ++++++++++++++--
4 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 254ce499f..e6f042ac9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,17 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## v2.4dev - [unreleased
+
+### `Added`
+
+- [](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
+### `Fixed`
+
+### `Dependencies`
+
+### `Deprecated`
+
## v2.3.5 - 2021-06-03
### `Added`
diff --git a/main.nf b/main.nf
index c7807d7a0..a9a15721c 100644
--- a/main.nf
+++ b/main.nf
@@ -227,6 +227,13 @@ if( params.bt2_index && params.mapper == 'bowtie2' ){
bwa_index_bwamem = Channel.empty()
}
+// Adapter removal adapter-list setup
+if ( !params.clip_adapters_list ) {
+ ch_adapterlist = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt")
+} else {
+ ch_adapterlist = Channel.fromPath(params.clip_adapters_list, checkIfExists: true)
+}
+
// SexDetermination channel set up and bedfile validation
if (!params.sexdeterrmine_bedfile) {
ch_bed_for_sexdeterrmine = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt")
@@ -770,12 +777,14 @@ process adapter_removal {
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*pair2.truncated.gz") optional true into ch_output_from_adapterremoval_r2
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*.settings") into ch_adapterremoval_logs
+ path(adapterlist) from ch_adapterlist
when:
!params.skip_adapterremoval
script:
- base = "${r1.baseName}_L${lane}"
+ def base = "${r1.baseName}_L${lane}"
+ def adapters_to_remove = !params.clip_adapter_list ? "--adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor}" : "--adapter-list ${adapterlist}"
//This checks whether we skip trimming and defines a variable respectively
def preserve5p = params.preserve5p ? '--preserve5p' : '' // applies to any AR command - doesn't affect output file combination
@@ -783,7 +792,7 @@ process adapter_removal {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -797,7 +806,7 @@ process adapter_removal {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -810,7 +819,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && !params.preserve5p ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -823,7 +832,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && params.preserve5p ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -864,7 +873,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && params.skip_collapse && !params.skip_trim ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
mv ${base}.pe.pair*.truncated.gz *.settings output/
"""
@@ -872,7 +881,7 @@ process adapter_removal {
//SE, collapse not possible, trim reads only
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
mv *.settings *.se.truncated.gz output/
"""
} else if ( seqtype != 'PE' && params.skip_trim ) {
diff --git a/nextflow.config b/nextflow.config
index c55a565f5..67004dfae 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -66,6 +66,7 @@ params {
//Read clipping and merging parameters
clip_forward_adaptor = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
clip_reverse_adaptor = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'
+ clip_adapters_list = null
clip_readlength = 30
clip_min_read_quality = 20
min_adap_overlap = 1
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 64814061c..9d1dddefb 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -465,6 +465,12 @@
"fa_icon": "fas fa-cut",
"help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`"
},
+ "clip_adapters_list": {
+ "type": "string",
+ "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters",
+ "fa_icon": "fas fa-cut",
+ "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters . First column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`"
+ },
"clip_readlength": {
"type": "integer",
"default": 30,
@@ -1568,7 +1574,7 @@
"maltextract_percentidentity": {
"type": "number",
"description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.",
- "default": 85,
+ "default": 85.0,
"fa_icon": "fas fa-id-card",
"help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`"
},
@@ -1662,5 +1668,11 @@
{
"$ref": "#/definitions/metagenomic_authentication"
}
- ]
+ ],
+ "properties": {
+ "adapters_list": {
+ "type": "string",
+ "default": "null"
+ }
+ }
}
\ No newline at end of file
From d95f31c3bb7aed0dd4da77e5fcbbb874ca968406 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 8 Jun 2021 10:32:16 +0200
Subject: [PATCH 20/70] Markdown lint
---
CHANGELOG.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6f042ac9..c0e568555 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,11 +3,12 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## v2.4dev - [unreleased
+## v2.4dev - [unreleased]
### `Added`
- [](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
+
### `Fixed`
### `Dependencies`
From 6807e5c1a60c1ccdfc061f246ec89e593e9c9bbf Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 8 Jun 2021 10:38:08 +0200
Subject: [PATCH 21/70] Input channel in input and not output :grimace:
---
main.nf | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/main.nf b/main.nf
index a9a15721c..31b1b662d 100644
--- a/main.nf
+++ b/main.nf
@@ -772,19 +772,20 @@ process adapter_removal {
input:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_fastp_for_adapterremoval
+ path(adapterlist) from ch_adapterlist
output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*pair2.truncated.gz") optional true into ch_output_from_adapterremoval_r2
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*.settings") into ch_adapterremoval_logs
- path(adapterlist) from ch_adapterlist
+
when:
!params.skip_adapterremoval
script:
def base = "${r1.baseName}_L${lane}"
- def adapters_to_remove = !params.clip_adapter_list ? "--adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor}" : "--adapter-list ${adapterlist}"
+ def adapters_to_remove = !params.clip_adapters_list ? "--adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor}" : "--adapter-list ${adapterlist}"
//This checks whether we skip trimming and defines a variable respectively
def preserve5p = params.preserve5p ? '--preserve5p' : '' // applies to any AR command - doesn't affect output file combination
@@ -792,7 +793,7 @@ process adapter_removal {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -806,7 +807,7 @@ process adapter_removal {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -819,7 +820,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && !params.preserve5p ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -832,7 +833,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && params.preserve5p ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz
@@ -873,7 +874,7 @@ process adapter_removal {
} else if ( seqtype == 'PE' && params.skip_collapse && !params.skip_trim ) {
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
mv ${base}.pe.pair*.truncated.gz *.settings output/
"""
@@ -881,7 +882,7 @@ process adapter_removal {
//SE, collapse not possible, trim reads only
"""
mkdir -p output
- AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities "${adapters_to_remove}" --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
+ AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities ${adapters_to_remove} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}
mv *.settings *.se.truncated.gz output/
"""
} else if ( seqtype != 'PE' && params.skip_trim ) {
From 61c1d7f3f75412bc876a43261f26430f0df38afd Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 9 Jun 2021 20:43:27 +0200
Subject: [PATCH 22/70] Update CHANGELOG.md
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c0e568555..1b24b248a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
-- [](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
+- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
### `Fixed`
From f10980b62c99e2819191df258b729b4cc6fc4335 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 9 Jun 2021 20:57:37 +0200
Subject: [PATCH 23/70] Fix file collision when not using adapter list on
single-end data
---
main.nf | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/main.nf b/main.nf
index 31b1b662d..5f2fc1cb1 100644
--- a/main.nf
+++ b/main.nf
@@ -229,7 +229,7 @@ if( params.bt2_index && params.mapper == 'bowtie2' ){
// Adapter removal adapter-list setup
if ( !params.clip_adapters_list ) {
- ch_adapterlist = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt")
+ ch_adapterlist = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy2.txt")
} else {
ch_adapterlist = Channel.fromPath(params.clip_adapters_list, checkIfExists: true)
}
From 3d85b00e15ea031ba5140cbf0784805806565391 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 9 Jun 2021 21:35:37 +0200
Subject: [PATCH 24/70] Fix adapter list being consumed by applying .collect()
---
main.nf | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/main.nf b/main.nf
index 5f2fc1cb1..8430c8f71 100644
--- a/main.nf
+++ b/main.nf
@@ -229,11 +229,18 @@ if( params.bt2_index && params.mapper == 'bowtie2' ){
// Adapter removal adapter-list setup
if ( !params.clip_adapters_list ) {
- ch_adapterlist = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy2.txt")
+ Channel
+ .fromPath("$projectDir/assets/nf-core_eager_dummy2.txt", checkIfExists: true)
+ .ifEmpty { exit 1, "[nf-core/eager] error: adapters list file not found. Please check input. Supplied: --clip_adapters_list '${params.clip_adapters_list}'." }
+ .into {ch_adapterlist}
} else {
- ch_adapterlist = Channel.fromPath(params.clip_adapters_list, checkIfExists: true)
+ Channel
+ .fromPath("${params.clip_adapters_list}", checkIfExists: true)
+ .ifEmpty { exit 1, "[nf-core/eager] error: adapters list file not found. Please check input. Supplied: --clip_adapters_list '${params.clip_adapters_list}'." }
+ .into {ch_adapterlist}
}
+
// SexDetermination channel set up and bedfile validation
if (!params.sexdeterrmine_bedfile) {
ch_bed_for_sexdeterrmine = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt")
@@ -772,14 +779,13 @@ process adapter_removal {
input:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_fastp_for_adapterremoval
- path(adapterlist) from ch_adapterlist
+ path adapterlist from ch_adapterlist.collect().dump(tag: "Adapter list")
output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*pair2.truncated.gz") optional true into ch_output_from_adapterremoval_r2
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*.settings") into ch_adapterremoval_logs
-
when:
!params.skip_adapterremoval
From 6741b0079bc5c667f422b90352d4d5609af2c28b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 9 Jun 2021 21:52:59 +0200
Subject: [PATCH 25/70] Remove accidentally merged pre-fastq trimming file from
channel of trimmed files
---
main.nf | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/main.nf b/main.nf
index 6056a6fe8..bf23ebfcf 100644
--- a/main.nf
+++ b/main.nf
@@ -930,11 +930,13 @@ if ( params.skip_collapse ){
// AdapterRemoval bypass when not running it
if (!params.skip_adapterremoval) {
ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
+ .dump(tag: "post_ar_adapterremoval_decision_skipar")
.filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
.dump(tag: "ar_bypass")
.into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
} else {
ch_fastp_for_skipadapterremoval
+ .dump(tag: "post_ar_adapterremoval_decision_withar")
.into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
}
@@ -1011,7 +1013,7 @@ if ( params.skip_collapse ){
// Inline barcode removal bypass when not running it
if (params.run_post_ar_trimming) {
- ch_post_ar_trimming_for_lanemerge.mix(ch_adapterremoval_for_skip_post_ar_trimming)
+ ch_adapterremoval_for_skip_post_ar_trimming
.dump(tag: "inline_removal_bypass")
.into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
} else {
@@ -1019,8 +1021,6 @@ if (params.run_post_ar_trimming) {
.into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
}
-
-
// Lane merging for libraries sequenced over multiple lanes (e.g. NextSeq)
ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
.groupTuple(by: [0,1,3,4,5,6])
From 29b01700c7f8bbd2a74d950f50583d2784bee8d2 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 10 Jun 2021 10:09:32 +0200
Subject: [PATCH 26/70] Add CI tests
---
.github/workflows/ci.yml | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 153a4befd..63fd7aef0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -102,6 +102,12 @@ jobs:
- name: ADAPTERREMOVAL Run the basic pipeline with preserve5p end and merged reads only options
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preserve5p --mergedonly
+ - name: ADAPTER LIST Run the basic pipeline using an adapter list
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
+ - name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list --skip_adapterremoval 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
- name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
From 2631c95c005f896273a63db25dff19a7d84f997a Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 10 Jun 2021 11:03:20 +0200
Subject: [PATCH 27/70] Put list in right place for CI
---
.github/workflows/ci.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 63fd7aef0..d24e540a6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -107,7 +107,7 @@ jobs:
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
- name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list --skip_adapterremoval 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt'
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval
- name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
From 863570e38d7a4e82da1782536f95e35e0a6b8c6d Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 11 Jun 2021 13:31:10 +0200
Subject: [PATCH 28/70] Update ci.yml
---
.github/workflows/ci.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 45e921907..264d63f4c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,7 +59,7 @@ jobs:
git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data
- name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
run: |
- if [[ $NXF_VER = '' ]]; then sleep 360; fi
+ if [[ $NXF_VER = '' ]]; then sleep 600; fi
- name: BASIC Run the basic pipeline with directly supplied single-end FASTQ
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input 'data/testdata/Mammoth/fastq/*_R1_*.fq.gz' --single_end
@@ -200,4 +200,4 @@ jobs:
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
- name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
\ No newline at end of file
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
From 707b5bcb0edac605fb2a245a28ce977ce984e56a Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 11 Jun 2021 14:37:00 +0200
Subject: [PATCH 29/70] Update ci.yml
---
.github/workflows/ci.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 264d63f4c..7b0315a4b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,7 +59,7 @@ jobs:
git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data
- name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
run: |
- if [[ $NXF_VER = '' ]]; then sleep 600; fi
+ if [[ $NXF_VER = '' ]]; then sleep 1200; fi
- name: BASIC Run the basic pipeline with directly supplied single-end FASTQ
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input 'data/testdata/Mammoth/fastq/*_R1_*.fq.gz' --single_end
From 5cf7bbcf159893b2aed62afe95195cd459ca6bba Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 16 Jun 2021 17:58:50 +0200
Subject: [PATCH 30/70] Update nextflow_schema.json
---
nextflow_schema.json | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 64814061c..bf4ea4551 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -16,7 +16,6 @@
"properties": {
"input": {
"type": "string",
- "default": "null",
"description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.",
"fa_icon": "fas fa-dna",
"help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications."
@@ -1663,4 +1662,4 @@
"$ref": "#/definitions/metagenomic_authentication"
}
]
-}
\ No newline at end of file
+}
From a9190981ff67e3933c499a4897f732a0f69df3ce Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 16 Jun 2021 18:04:55 +0200
Subject: [PATCH 31/70] Update CHANGELOG.md
---
CHANGELOG.md | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 254ce499f..e6f419d9f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,14 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## v2.4dev - [unreleased]
+
+### `Added`
+
+### `Fixed`
+
+- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
+
## v2.3.5 - 2021-06-03
### `Added`
From 81edc7b1a1ba790f3d8a2d8578f52c13f56679f9 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Thu, 17 Jun 2021 09:26:40 +0200
Subject: [PATCH 32/70] Improve endogenous post description
---
CHANGELOG.md | 4 +++-
docs/output.md | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2f082123..1afb4d64f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,12 +3,14 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## v2.3.6dev - [unreleased]
+## v2.4dev - [unreleased]
### `Added`
### `Fixed`
+- Improved output documentation for MultiQC general stats table
+
### `Dependencies`
### `Deprecated`
diff --git a/docs/output.md b/docs/output.md
index cc07d9a69..d37b183c9 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -76,7 +76,7 @@ The possible columns displayed by default are as follows:
* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering.
* **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content.
* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second)
-* **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (e.g. for mapping quality) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM.
+* **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM.
* **ClusterFactor** This is from DeDup. This is a value representing the how many duplicates in the library exist for each unique read. A cluster factor close to one replicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper.
* **Dups** This is from Picard's markDuplicates. It represents the percentage of reads in your library that were exact duplicates of other reads in your database. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective).
* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base.
From 7ebe4256716e9031f3b059d1ee649d5066648d16 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 17 Jun 2021 11:43:04 +0200
Subject: [PATCH 33/70] Update CHANGELOG.md
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1afb4d64f..9a571bdef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Fixed`
-- Improved output documentation for MultiQC general stats table
+- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele)
### `Dependencies`
From d1d3275d540a309083659045437ef48c8f2230ab Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Thu, 24 Jun 2021 13:49:55 +0200
Subject: [PATCH 34/70] Update usage.md
---
docs/usage.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/usage.md b/docs/usage.md
index c683a473c..701a2acff 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -111,7 +111,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
* Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud.
* A generic configuration profile to be used with [Conda](https://conda.io/docs/)
* Pulls most software from [Bioconda](https://bioconda.github.io/)
-* `test_tsv
+* `test_tsv`
* A profile with a complete configuration for automated testing
* Includes links to test data so needs no other parameters
From be66202a097d76aa9562c803f23f1b812eb6aab7 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Tue, 6 Jul 2021 08:31:50 +0200
Subject: [PATCH 35/70] Update README.md
---
README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README.md b/README.md
index 193a88d96..b2a42a8e9 100644
--- a/README.md
+++ b/README.md
@@ -7,11 +7,13 @@
[](https://www.nextflow.io/)
[](https://nf-co.re/)
[](https://zenodo.org/badge/latestdoi/135918251)
+[](https://peerj.com/articles/10947/)
[](https://bioconda.github.io/)
[](https://hub.docker.com/r/nfcore/eager)

+
[](https://nfcore.slack.com/channels/eager)
## Introduction
From de1bbecf33542af633d577549745db6e9d47bfb9 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Tue, 6 Jul 2021 08:32:42 +0200
Subject: [PATCH 36/70] Update README.md
---
README.md | 1 -
1 file changed, 1 deletion(-)
diff --git a/README.md b/README.md
index b2a42a8e9..ac9ef2e67 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@
[](https://hub.docker.com/r/nfcore/eager)

-
[](https://nfcore.slack.com/channels/eager)
## Introduction
From 8ae027e42caea8d6800acf9d2a521bdcb4fce237 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Mon, 12 Jul 2021 09:18:47 +0200
Subject: [PATCH 37/70] Update input TSV based on feedback from Elina
---
docs/usage.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/docs/usage.md b/docs/usage.md
index c683a473c..d3dfa892b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -348,7 +348,10 @@ will have the following effects:
Note the following important points and limitations for setting up:
* The TSV must use actual tabs (not spaces) between cells.
+* The input FASTQ filenames are discarded after FastQC; the filenames of all other downstream results files are based on the `Sample_Name`, `Library_ID` and `Lane` columns.
* *File* names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)).
+ * At different stages of the merging process (as above), nf-core/eager will use the information from the `Sample_Name`, `Library_ID` and `Lane` columns for output filenames.
+ * In other words, your .tsv file must not have rows with the same `Library_ID` (e.g. `Library1`) for both `SampleA` and `SampleB`. While nf-core/eager would not try to _merge_ these, in some stages of the pipeline the output file names would be the same, and one would overwrite the other if the files are output to the same `results/` subdirectory.
* If it is 'too late' and you already have duplicate file names, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file.
* Lane IDs must be unique for each sequencing of each library.
* If you have a library sequenced e.g. on Lane 8 of two HiSeq runs, you can give a fake lane ID (e.g. 20) for one of the FASTQs, and the libraries will still be processed correctly.
From ac1c4ed0d8b8cb63bf2231984d63815c20f23d8f Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 16 Jul 2021 08:36:51 +0200
Subject: [PATCH 38/70] Update usage.md
---
docs/usage.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/usage.md b/docs/usage.md
index d3dfa892b..8035254c3 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -361,11 +361,11 @@ Note the following important points and limitations for setting up:
* nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration
* Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together.
* Same libraries that are sequenced on different sequencing configurations (i.e single- and paired-end data), will be merged after mapping and will _always_ be considered 'paired-end' during downstream processes
- * **Important** running DeDup in this context is _not_ recommended, as PE and SE data at the same position will _not_ be evaluated as duplicates. Therefore not all duplicates will be removed.
+ * **Important** running deduplication using DeDup (via `--dedupper 'dedup'`) in this context is not recommended, as PE and SE data at the same position will not be evaluated as duplicates. Therefore not all duplicates will be removed.
* When you wish to run PE/SE data together `-dedupper markduplicates` is therefore preferred.
* An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`.
* If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager
- * If you _regularly_ want to run the situation above, please leave a feature request on github.
+ * If you _regularly_ want to run the situation above, please leave a feature request on github.
* DamageProfiler, NuclearContamination, MTtoNucRatio and PreSeq are performed on each unique library separately after deduplication (but prior same-treated library merging).
* nf-core/eager functionality such as `--run_trim_bam` will be applied to only non-UDG (UDG_Treatment: none) or half-UDG (UDG_Treatment: half) libraries. - Qualimap is run on each sample, after merging of libraries (i.e. your values will reflect the values of all libraries combined - after being damage trimmed etc.).
* Genotyping will be typically performed on each `sample` independently, as normally all libraries will have been merged together. However, if you have a mixture of single-stranded and double-stranded libraries, you will normally need to genotype separately. In this case you **must** give each the SS and DS libraries _distinct_ `Sample_IDs`; otherwise you will receive a `file collision` error in steps such as `sexdeterrmine`, and then you will need to merge these yourself. We will consider changing this behaviour in the future if there is enough interest.
From 4ac59bf9a500b45828fb984176594898b457f2fd Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 16 Jul 2021 08:37:37 +0200
Subject: [PATCH 39/70] Update CHANGELOG.md
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a571bdef..2ccda3b18 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Fixed`
-- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele)
+- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
### `Dependencies`
From c51b79459b8bb0fb51dea75bc114682d9cd79654 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Fri, 16 Jul 2021 14:20:49 +0200
Subject: [PATCH 40/70] Update usage.md
---
docs/usage.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/usage.md b/docs/usage.md
index 8035254c3..556df26fe 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -360,9 +360,9 @@ Note the following important points and limitations for setting up:
* You should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses.
* nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration
* Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together.
-* Same libraries that are sequenced on different sequencing configurations (i.e single- and paired-end data), will be merged after mapping and will _always_ be considered 'paired-end' during downstream processes
- * **Important** running deduplication using DeDup (via `--dedupper 'dedup'`) in this context is not recommended, as PE and SE data at the same position will not be evaluated as duplicates. Therefore not all duplicates will be removed.
- * When you wish to run PE/SE data together `-dedupper markduplicates` is therefore preferred.
+* nf-core/eager is able to correctly handle libraries that are sequenced multiple times on different sequencing configurations (i.e. mixtures of single- and paired-end data). These will be merged after mapping and considered 'paired-end' during downstream processes.
+ * **Important** we do not recommend choosing to use DeDup (i.e. `--dedupper 'dedup'`) when mixing PE and SE data, as SE data will not necessarily have the correct end position of the read, and DeDup requires both ends of the molecule to remove a duplicate read. Therefore you may end up with inflated (false-positive) coverages due to suboptimal deduplication.
+ * When you wish to run PE/SE data together, the default `--dedupper 'markduplicates'` is therefore preferred, as it only looks at the first position. While more conservative (i.e. it'll remove more reads even if not technically duplicates, because it assumes it can't see the true ends of molecules), it is more consistent.
* An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`.
* If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager
* If you _regularly_ want to run the situation above, please leave a feature request on github.
From 176a0852c0978e7aa9805ea3687c01126868f764 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 26 Jul 2021 09:56:36 +0200
Subject: [PATCH 41/70] Add LC_Extrap mode for @robert-davidson
---
.github/workflows/ci.yml | 3 +++
.github/workflows/linting.yml | 2 +-
assets/multiqc_config.yaml | 4 +++-
docs/output.md | 2 +-
main.nf | 33 ++++++++++++++++++++++-------
nextflow.config | 5 +++++
nextflow_schema.json | 39 +++++++++++++++++++++++++++++++++++
7 files changed, 77 insertions(+), 11 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 213c2ac69..d9357a884 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -120,6 +120,9 @@ jobs:
- name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
+ - name: PRESEQ Run basic mapping pipeline with different preseq mode
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --preseq_mode 'lc_extrap' --preseq_maxextrap 10000 --preseq_bootstrap 10
- name: DEDUPLICATION Test with dedup
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index fcde400ce..83a8bc100 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -107,7 +107,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install nf-core
+ pip install nf-core==1.14
- name: Run nf-core lint
env:
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index 060c92028..021b86646 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -86,7 +86,9 @@ top_modules:
- '*_postfilterflagstat.stats'
- 'dedup'
- 'picard'
- - 'preseq'
+ - 'preseq':
+ path_filters:
+ - '*.preseq'
- 'damageprofiler'
- 'mtnucratio'
- 'qualimap'
diff --git a/docs/output.md b/docs/output.md
index cc07d9a69..17cb03792 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -653,7 +653,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
* `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you.
-* `preseq/`: this contains a `.ccurve` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
+* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the number of unique reads that will be yielded if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth.
* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads.
* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
diff --git a/main.nf b/main.nf
index c7807d7a0..9f2517b98 100644
--- a/main.nf
+++ b/main.nf
@@ -46,10 +46,15 @@ if ( params.skip_collapse && params.skip_trim ) {
}
// Bedtools validation
-if(params.run_bedtools_coverage && !params.anno_file ){
+if( params.run_bedtools_coverage && !params.anno_file ){
exit 1, "[nf-core/eager] error: you have turned on bedtools coverage, but not specified a BED or GFF file with --anno_file. Please validate your parameters."
}
+// Preseq validation
+if( !params.skip_preseq && !( params.preseq_mode == 'c_curve' || params.preseq_mode == 'lc_extrap' ) ) {
+ exit 1, "[nf-core/eager] error: you are running preseq with a unsupported mode. See documentation for more information. You gave: ${params.preseq_mode}."
+}
+
// BAM filtering validation
if (!params.run_bam_filtering && params.bam_mapping_quality_threshold != 0) {
exit 1, "[nf-core/eager] error: please turn on BAM filtering if you want to perform mapping quality filtering! Provide: --run_bam_filtering."
@@ -1899,21 +1904,33 @@ process preseq {
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(input) from ch_input_for_preseq
output:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.ccurve") into ch_preseq_for_multiqc
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${input.baseName}.preseq") into ch_preseq_for_multiqc
script:
pe_mode = params.skip_collapse && seqtype == "PE" ? '-P' : ''
- if(!params.skip_deduplication && params.dedupper == "dedup"){
+ if(!params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "dedup"){
+ """
+ preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input}
+ """
+ } else if( !params.skip_deduplication && params.preseq_mode == 'c_curve' && params.dedupper == "markduplicates"){
+ """
+ preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
+ """
+ } else if ( params.skip_deduplication && params.preseq_mode == 'c_curve' ) {
+ """
+ preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode}
+ """
+ } else if(!params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "dedup"){
"""
- preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -H ${input}
+ preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -H ${input} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
"""
- } else if( !params.skip_deduplication && params.dedupper == "markduplicates"){
+ } else if( !params.skip_deduplication && params.preseq_mode == 'lc_extrap' && params.dedupper == "markduplicates"){
"""
- preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
+ preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
"""
- } else if ( params.skip_deduplication ) {
+ } else if ( params.skip_deduplication && params.preseq_mode == 'lc_extrap' ) {
"""
- preseq c_curve -s ${params.preseq_step_size} -o ${input.baseName}.ccurve -B ${input} ${pe_mode}
+ preseq lc_extrap -s ${params.preseq_step_size} -o ${input.baseName}.preseq -B ${input} ${pe_mode} -n ${params.preseq_bootstrap} -e ${params.preseq_maxextrap} -cval ${params.preseq_cval} -x ${params.preseq_terms}
"""
}
}
diff --git a/nextflow.config b/nextflow.config
index 9f9b30fc8..a619bf3d6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -108,6 +108,11 @@ params {
//Preseq settings
preseq_step_size = 1000
+ preseq_mode = 'c_curve'
+ preseq_bootstrap = 100
+ preseq_maxextrap = 10000000000
+ preseq_cval = 0.95
+ preseq_terms = 100
//DamageProfiler settings
damageprofiler_length = 100
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 64814061c..823bbac0a 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -762,12 +762,51 @@
"description": "Options for calculating library complexity (i.e. how many unique reads are present).",
"default": "",
"properties": {
+ "preseq_mode": {
+ "type": "string",
+ "default": "c_curve",
+ "description": "Specify which mode of preseq to run.",
+ "fa_icon": "fas fa-toggle-on",
+ "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c_curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc_extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram",
+ "enum": [
+ "c_curve",
+ "lc_extrap"
+ ]
+ },
"preseq_step_size": {
"type": "integer",
"default": 1000,
"description": "Specify the step size of Preseq.",
"fa_icon": "fas fa-shoe-prints",
"help_text": "Can be used to configure the step size of Preseq's `c_curve` method. Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve parameter: `-s`"
+ },
+ "preseq_maxextrap": {
+ "type": "integer",
+ "default": 10000000000,
+ "description": "Specify the maximum extrapolation (lc_extrap mode only)",
+ "fa_icon": "fas fa-ban",
+ "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`"
+ },
+ "preseq_terms": {
+ "type": "integer",
+ "default": 100,
+ "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)",
+ "fa_icon": "fas fa-sort-numeric-up-alt",
+ "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`"
+ },
+ "preseq_bootstrap": {
+ "type": "integer",
+ "default": 100,
+ "description": "Specify number of bootstraps to perform (lc_extrap mode only)",
+ "fa_icon": "fab fa-bootstrap",
+ "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`"
+ },
+ "preseq_cval": {
+ "type": "number",
+ "default": 0.95,
+ "description": "Specify confidence interval level (lc_extrap mode only)",
+ "fa_icon": "fas fa-check-circle",
+ "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`"
}
},
"fa_icon": "fas fa-bezier-curve",
From 58de9fc5e850fbee9e2539fe22eb6743d33466cd Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 26 Jul 2021 09:59:56 +0200
Subject: [PATCH 42/70] Updated changelog
---
CHANGELOG.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2f082123..e87ccd0af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
+- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
+
### `Fixed`
### `Dependencies`
From 9207374b2ca7fc50f80a2934f7f03ccfbdafee50 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 26 Jul 2021 10:07:01 +0200
Subject: [PATCH 43/70] Update changelog
---
CHANGELOG.md | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3907b4cde..ec1b2d98e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,19 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## v2.4dev - [unreleased]
+
+### `Added`
+
+- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) adds post-adapter removal barcode/fastq trimming
+### `Fixed`
+
+- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
+- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
+
+### `Dependencies`
+
+### `Deprecated`
## v2.3.5dev - [date]
### `Added`
From e27a8a1e7c16a994f2c3d293cba1839b5bb141dd Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 26 Jul 2021 10:13:38 +0200
Subject: [PATCH 44/70] Fix schema json
---
nextflow_schema.json | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9d1dddefb..3fab54e2a 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -16,7 +16,6 @@
"properties": {
"input": {
"type": "string",
- "default": "null",
"description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.",
"fa_icon": "fas fa-dna",
"help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications."
@@ -1574,7 +1573,7 @@
"maltextract_percentidentity": {
"type": "number",
"description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.",
- "default": 85.0,
+ "default": 85,
"fa_icon": "fas fa-id-card",
"help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`"
},
@@ -1668,11 +1667,5 @@
{
"$ref": "#/definitions/metagenomic_authentication"
}
- ],
- "properties": {
- "adapters_list": {
- "type": "string",
- "default": "null"
- }
- }
+ ]
}
\ No newline at end of file
From 3ad0668c17e2296a85698ea75b193ef5bfb76f37 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 28 Jul 2021 09:18:53 +0200
Subject: [PATCH 45/70] Update output.md
---
docs/output.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/output.md b/docs/output.md
index d37b183c9..8a7bdadab 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -333,7 +333,7 @@ Ancient DNA samples typically have low endogenous DNA values, as most of the DNA
-The main additional useful information compared to [Samtools](#samtools) is that these plots can inform you how many reads had multiple places on the reference the read could align to. This can occur with low complexity reads or reads derived from e.g. repetitive regions on the genome. If you have large amounts of multi-mapping reads, this can be a warning flag that there is an issue either with the reference genome or library itself (e.g. over-amplification of low-complexity regions or library construction artefacts). You should investigate cases like this more closely before using the data downstream.
+The main additional useful information compared to [Samtools](#samtools) is that these plots can inform you how many reads had multiple places on the reference the read could align to. This can occur with low complexity reads or reads derived from e.g. repetitive regions on the genome. If you have large amounts of multi-mapping reads, this can be a warning flag that there is an issue either with the reference genome or library itself (e.g. library construction artefacts). You should investigate cases like this more closely before using the data downstream.
### MALT
From 230aed68339423be16356f36c76b1d07738c7270 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 11 Aug 2021 15:34:16 +0200
Subject: [PATCH 46/70] Update CHANGELOG.md
---
CHANGELOG.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32523330c..9f571933e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
+- Improved output documentation for BowTie2 (thanks to @isinaltinkaya)
### `Dependencies`
From 54396a8066cd4940e4b73ebf2bb98e75d2e074d2 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Mon, 23 Aug 2021 13:39:38 +0200
Subject: [PATCH 47/70] Update contributors list
---
README.md | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index ac9ef2e67..7adeae6f2 100644
--- a/README.md
+++ b/README.md
@@ -161,17 +161,21 @@ Those who have provided conceptual guidance, suggestions, bug reports etc.
* [Alexandre Gilardet](https://github.com/alexandregilardet)
* Arielle Munters
-* [Charles Plessy](https://github.com/charles-plessy)
* [Åshild Vågene](https://github.com/ashildv)
+* [Charles Plessy](https://github.com/charles-plessy)
+* [Elina Salmela](https://github.com/esalmela)
* [Hester van Schalkwyk](https://github.com/hesterjvs)
* [Ido Bar](https://github.com/IdoBar)
* [Irina Velsko](https://github.com/ivelsko)
+* [Işın Altınkaya](https://github.com/isinaltinkaya)
* [Katerine Eaton](https://github.com/ktmeaton)
+* [Kathrin Nägele](https://github.com/KathrinNaegele)
* [Luc Venturini](https://github.com/lucventurini)
* [Marcel Keller](https://github.com/marcel-keller)
* [Pierre Lindenbaum](https://github.com/lindenb)
* [Pontus Skoglund](https://github.com/pontussk)
* [Raphael Eisenhofer](https://github.com/EisenRa)
+* [Roberta Davidson](https://github.com/roberta-davidson)
* [Torsten Günter](https://bitbucket.org/tguenther/)
* [Kevin Lord](https://github.com/lordkev)
* [He Yu](https://github.com/paulayu)
From a68b9d45c61dca76d41194d0cf0de30fc4c6748a Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Mon, 23 Aug 2021 20:27:13 +0200
Subject: [PATCH 48/70] Update docs/output.md
Co-authored-by: Alexander Peltzer
---
docs/output.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/output.md b/docs/output.md
index b3030d6b1..9fd7280d4 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -622,7 +622,7 @@ If this correlation is not observed, your data is skewed towards higher coverage
### Background
-Bcftools is a toolkit for processing and summaries VCF files, i.e. variant call format files. nf-core/eager currently uses bcftools for the `stats` functionality. This summarises in a text file a range of statistics about VCF files, produced by GATK and FreeBayes variant callers.
+Bcftools is a toolkit for processing and summarising VCF files, i.e. variant call format files. nf-core/eager currently uses bcftools for the `stats` functionality. This summarises in a text file a range of statistics about VCF files, produced by GATK and FreeBayes variant callers.
#### Variant Substitution Types
From d51783ead3056fc43ae5d7926ac4c4e3d6e56a7b Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 23 Aug 2021 21:25:03 +0200
Subject: [PATCH 49/70] Delete useless block
---
main.nf | 21 +++------------------
1 file changed, 3 insertions(+), 18 deletions(-)
diff --git a/main.nf b/main.nf
index d43fec147..98c88688d 100644
--- a/main.nf
+++ b/main.nf
@@ -2213,24 +2213,8 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
}
-
-
// Unified Genotyper - although not-supported, better for aDNA (because HC does de novo assembly which requires higher coverages), and needed for MultiVCFAnalyzer
-// initialise empty bcftool related empty channels
-
-//if ( params.genotyping_tool == 'ug' ) {
-// ch_hc_for_bcftools_stats = Channel.empty()
-// ch_fb_for_bcftools_stats = Channel.empty()
-//} else if ( params.genotyping_tool == 'hc' ) {
-// ch_ug_for_bcftools_stats = Channel.empty()
-// ch_fb_for_bcftools_stats = Channel.empty()
-//} else if ( params.genotyping_tool == 'fb ') {
-// ch_ug_for_bcftools_stats = Channel.empty()
-// ch_hc_for_bcftools_stats = Channel.empty()
-//}
-
-
process genotyping_ug {
label 'mc_small'
tag "${samplename}"
@@ -2517,7 +2501,7 @@ process vcf2genome {
pigz -d -f -p ${task.cpus} ${vcf}
vcf2genome -Xmx${task.memory.toGiga()}g -draft ${out} -draftname "${fasta_head}" -in ${vcf.baseName} -minc ${params.vcf2genome_minc} -minfreq ${params.vcf2genome_minfreq} -minq ${params.vcf2genome_minq} -ref ${fasta} -refMod ${out}_refmod.fasta -uncertain ${out}_uncertainty.fasta
pigz -f -p ${task.cpus} ${out}*
- pigz -p ${task.cpus} *.vcf
+ bgzip -@ ${task.cpus} *.vcf
"""
}
@@ -2560,6 +2544,7 @@ process multivcfanalyzer {
pigz -d -f -p ${task.cpus} ${vcf}
multivcfanalyzer -Xmx${task.memory.toGiga()}g ${params.snp_eff_results} ${fasta} ${params.reference_gff_annotations} . ${write_freqs} ${params.min_genotype_quality} ${params.min_base_coverage} ${params.min_allele_freq_hom} ${params.min_allele_freq_het} ${params.reference_gff_exclude} *.vcf
pigz -p ${task.cpus} *.tsv *.txt snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta fullAlignment.fasta
+ bgzip -@ ${task.cpus} *.vcf
"""
}
@@ -2616,7 +2601,7 @@ process sexdeterrmine_prep {
process sexdeterrmine {
label 'mc_small'
publishDir "${params.outdir}/sex_determination", mode: params.publish_dir_mode
-
+
input:
path bam from ch_prepped_for_sexdeterrmine.collect()
path(bed) from ch_bed_for_sexdeterrmine
From 91c51f48ea14f97e889a193fc28e21c162b03227 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 23 Aug 2021 21:38:07 +0200
Subject: [PATCH 50/70] Update best practise defaults and bam trimming defaults
---
CHANGELOG.md | 2 ++
nextflow.config | 14 +++++++-------
nextflow_schema.json | 12 ++++++------
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c531715b..3a72f5728 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
- Improved output documentation for BowTie2 (thanks to @isinaltinkaya)
+- [#612](https://github.com/nf-core/eager/issues/612) Updated BAM trimming defaults to 0 to ensure no unwanted trimming when mixing half-UDG with no-UDG (thanks to @scarlhoff)
+- [#722](https://github.com/nf-core/eager/issues/722) Updated BWA mapping mapping parameters to latest recommendations - primarily alnn back to 0.01 and aln0 to 2 as per Oliva et al. 2021 (10.1093/bib/bbab076)
### `Dependencies`
diff --git a/nextflow.config b/nextflow.config
index 0a405044d..c7a8d8703 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -83,10 +83,10 @@ params {
//Mapping algorithm
mapper = 'bwaaln'
- bwaalnn = 0.04
+ bwaalnn = 0.01 // From Oliva et al. 2021 (10.1093/bib/bbab076)
bwaalnk = 2
- bwaalnl = 1024 // From Schubert et al. 2012 (10.1186/1471-2164-13-178)
- bwaalno = 1 // leave at bwa default for now
+ bwaalnl = 1024 // From Oliva et al. 2021 (10.1093/bib/bbab076)
+ bwaalno = 2 // From Oliva et al. 2021 (10.1093/bib/bbab076)
circularextension = 500
circulartarget = 'MT'
circularfilter = false
@@ -144,10 +144,10 @@ params {
//bamUtils trimbam settings
run_trim_bam = false
- bamutils_clip_half_udg_left = 1
- bamutils_clip_half_udg_right = 1
- bamutils_clip_none_udg_left = 1
- bamutils_clip_none_udg_right = 1
+ bamutils_clip_half_udg_left = 0
+ bamutils_clip_half_udg_right = 0
+ bamutils_clip_none_udg_left = 0
+ bamutils_clip_none_udg_right = 0
bamutils_softclip = false
//Genotyping options
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 4ae9d6891..d25b1ba30 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -577,7 +577,7 @@
},
"bwaalnn": {
"type": "number",
- "default": 0.04,
+ "default": 0.01,
"description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.",
"fa_icon": "fas fa-sort-numeric-down",
"help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.04` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`"
@@ -598,7 +598,7 @@
},
"bwaalno": {
"type": "integer",
- "default": 1,
+ "default": 2,
"fa_icon": "fas fa-people-arrows",
"description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.",
"help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n"
@@ -966,28 +966,28 @@
},
"bamutils_clip_half_udg_left": {
"type": "integer",
- "default": 1,
+ "default": 0,
"fa_icon": "fas fa-ruler-combined",
"description": "Specify the number of bases to clip off reads from 'left' end of read for half-UDG libraries.",
"help_text": "Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`"
},
"bamutils_clip_half_udg_right": {
"type": "integer",
- "default": 1,
+ "default": 0,
"fa_icon": "fas fa-ruler",
"description": "Specify the number of bases to clip off reads from 'right' end of read for half-UDG libraries.",
"help_text": "Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`"
},
"bamutils_clip_none_udg_left": {
"type": "integer",
- "default": 1,
+ "default": 0,
"fa_icon": "fas fa-ruler-combined",
"description": "Specify the number of bases to clip off reads from 'left' end of read for non-UDG libraries.",
"help_text": "Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`"
},
"bamutils_clip_none_udg_right": {
"type": "integer",
- "default": 1,
+ "default": 0,
"fa_icon": "fas fa-ruler",
"description": "Specify the number of bases to clip off reads from 'right' end of read for non-UDG libraries.",
"help_text": "Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`"
From 1d42a5caa393defcf4d87458cf5c769d011fb7c8 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 23 Aug 2021 22:03:12 +0200
Subject: [PATCH 51/70] Changelog linting
---
CHANGELOG.md | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 178c6c981..7e8c71f2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,16 +8,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### `Added`
- [#317](https://github.com/nf-core/eager/issues/317) Added bcftools stats for general genotyping statistics of VCF files
-
-### `Fixed`
-
-- Fixed some missing or incorrectly reported software versions
- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
-- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) adds post-adapter removal barcode/fastq trimming
+- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
### `Fixed`
+- Fixed some missing or incorrectly reported software versions
- [#771](https://github.com/nf-core/eager/issues/771) Remove legacy code
- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
- Improved output documentation for BowTie2 (thanks to @isinaltinkaya)
From 5ce424a32bc9aa4a33aba8d3b039e9f3876d19b7 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Mon, 23 Aug 2021 22:08:22 +0200
Subject: [PATCH 52/70] A few version bumps
---
environment.yml | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/environment.yml b/environment.yml
index 171d13d61..4a65d0a7e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
- bioconda::adapterremoval=2.3.2
- bioconda::adapterremovalfixprefix=0.0.5
- bioconda::bwa=0.7.17
- - bioconda::picard=2.25.5
+ - bioconda::picard=2.26.0
- bioconda::samtools=1.12
- bioconda::dedup=0.12.8
- bioconda::angsd=0.935
@@ -26,7 +26,7 @@ dependencies:
- bioconda::qualimap=2.2.2d
- bioconda::vcf2genome=0.91
- bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8
- - bioconda::multiqc=1.10.1
+ - bioconda::multiqc=1.11
- bioconda::pmdtools=0.60
- bioconda::bedtools=2.30.0
- conda-forge::libiconv=1.16
@@ -48,5 +48,5 @@ dependencies:
- bioconda::bowtie2=2.4.4
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.1
- - bioconda::bbmap=38.90
- - bioconda::bcftools=1.12
+ - bioconda::bbmap=38.92
+ - bioconda::bcftools=1.13
From 93da6eac3a8ccb58b4ad2b307e813d2dc2c0948a Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Tue, 24 Aug 2021 08:22:14 +0200
Subject: [PATCH 53/70] Sync bcftools with samtools version
---
environment.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/environment.yml b/environment.yml
index 4a65d0a7e..7765f5688 100644
--- a/environment.yml
+++ b/environment.yml
@@ -49,4 +49,4 @@ dependencies:
- bioconda::eigenstratdatabasetools=1.0.2
- bioconda::mapdamage2=2.2.1
- bioconda::bbmap=38.92
- - bioconda::bcftools=1.13
+ - bioconda::bcftools=1.12
From 55317f3adf134324efe134204fb4bb02ccbc42f3 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Tue, 24 Aug 2021 20:26:15 +0200
Subject: [PATCH 54/70] Update CHANGELOG to report all updated dependencies
---
CHANGELOG.md | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 808b7bc0a..124a54a8d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#317](https://github.com/nf-core/eager/issues/317) Added bcftools stats for general genotyping statistics of VCF files
- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) adds post-adapter removal barcode/fastq trimming
-- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
+- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (suggested by @roberta-davidson)
### `Fixed`
@@ -19,10 +19,35 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Improved output documentation for MultiQC general stats table (thanks to @KathrinNaegele and @esalmela)
- Improved output documentation for BowTie2 (thanks to @isinaltinkaya)
- [#612](https://github.com/nf-core/eager/issues/612) Updated BAM trimming defaults to 0 to ensure no unwanted trimming when mixing half-UDG with no-UDG (thanks to @scarlhoff)
-- [#722](https://github.com/nf-core/eager/issues/722) Updated BWA mapping mapping parameters to latest recommendations - primarily alnn back to 0.01 and aln0 to 2 as per Oliva et al. 2021 (10.1093/bib/bbab076)
+- [#722](https://github.com/nf-core/eager/issues/722) Updated BWA mapping parameters to latest recommendations - primarily alnn back to 0.01 and alno to 2 as per Oliva et al. 2021 (10.1093/bib/bbab076)
### `Dependencies`
+- Bumped python: 3.7.3 -> 3.9.4
+- Bumped markdown: 3.2.2 -> 3.3.4
+- Bumped pymdown-extensions: 7.1 -> 8.2
+- Bumped pygments: 2.6.1 -> 2.9.0
+- Bumped adapterremoval: 2.3.1 -> 2.3.2
+- Bumped picard: 2.22.9 -> 2.26.0
+- Bumped samtools: 1.9 -> 1.12
+- Bumped angsd: 0.933 -> 0.935
+- Bumped gatk4: 4.1.7.0 -> 4.2.0.0
+- Bumped multiqc: 1.10.1 -> 1.11
+- Bumped bedtools: 2.29.2 -> 2.30.0
+- Bumped libiconv: 1.15 -> 1.16
+- Bumped preseq: 2.0.3 -> 3.1.2
+- Bumped bamutil: 1.0.14 -> 1.0.15
+- Bumped pysam: 0.15.4 -> 0.16.0
+- Bumped kraken2: 2.1.1 -> 2.1.2
+- Bumped pandas: 1.0.4 -> 1.2.4
+- Bumped freebayes: 1.3.2 -> 1.3.5
+- Bumped biopython: 1.76 -> 1.79
+- Bumped xopen: 0.9.0 -> 1.1.0
+- Bumped bowtie2: 2.4.2 -> 2.4.4
+- Bumped mapdamage2: 2.2.0 -> 2.2.1
+- Bumped bbmap: 38.87 -> 38.92
+- Added bcftools: 1.12
+
### `Deprecated`
## v2.3.5 - 2021-06-03
From 8e81bd3341d016133757f5d29f21b1a6aca2270c Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 25 Aug 2021 20:07:11 +0200
Subject: [PATCH 55/70] Update images for release
---
docs/images/usage/eager2_metromap_complex.png | Bin 529030 -> 547703 bytes
docs/images/usage/eager2_metromap_complex.svg | 178 +++++----
docs/images/usage/eager2_workflow.png | Bin 562575 -> 532531 bytes
docs/images/usage/eager2_workflow.svg | 364 +++++++++++-------
4 files changed, 336 insertions(+), 206 deletions(-)
diff --git a/docs/images/usage/eager2_metromap_complex.png b/docs/images/usage/eager2_metromap_complex.png
index 244bd76ad5724284a313c9320626a3036acdf310..8791e67f106a737aa7399dfefa5f04da23898ada 100644
GIT binary patch
literal 547703
zcmeFZXH=AD*9JOyeWQtu7z+^*Fzifq1ycOi`!~cH0dqvp_gW*1o{`=`(jD#(Gc+mRNEo*smJ!`w$mbw@_
zJ3DSeGh-|5+jn)j%`Npq#zdJgm_ISNzt7*i7dp~u|FX%3xH{Rq^ZoAapI9^g_)oAg
zcTir;qyKnHdnR1+(hCU27he@ozFf&w-{CSeXb`zggnLl#SEYSwyYAw|-`~MKjQjB=$=H8==g%h-`+j)D&Hvxu!S4Cj0@LvS{touv3sn65U*GxvHwASQP3wG@
z<2do%bf;an*z9mK!7uyrv%^6rqyw-xVe5hCO)08M75O3G$|P1SR;B`rvAD4}-=Brr
z_XYX8c>lVWRj02iXunESbTrm+G+XI}%=$au?E@*IcgYOd>Iu9{7w
z!hQ{MdKt6yco%U8$-Ho~bOr`~H*6-m;{fNUZy&vUvEEGlZ37sxCr9e34=KHkuP=F3
z1@k9eF!sA_XlN+DIFgCQSy)))blSFaiN9~Uv$8}{&bKziN8@~16I64}HdcJj^oCmF
z7V6Gq*Jn3tga~#!WnM6@3KtX653^A`A(Pb$gY$5yWL@z(z-3A9v{RZrtNNj*+;<1L
zu#DO9=Iq(VHp93u#rfn%I}hGy{J!?KYn3gNRBes_y)uBS@}Pd`3ndNvm<|!)m8E_O
zk{_EqyM0E=`&S;!!RofHy1pdijyIEfFOmJlVB*tsTYIcn
z?0a79Jbd;p-_>lJ`7Ux%ARkFIGqGdl;ja-x8VE{|Rq#9%@Wdy5~gc4eTe9yV|3t
zC6;IEmwHZ0u!)*C#9xo8ULWmR2+d--ahWx%+f|pJj0nXl7v8gwg>P_b7CA+cBvzNE
z`D}(7u;pU~!#5t1ixv_Ff-z8lPEkKB<}2>;X>){u&m@7Y$kMXB^?sI
zcXws6cvYm_GfZ~8cy*=2WpT7O`1aR#?gxb|btJu+{GaSSsfP|LLkAZuXdaHv&*O;g
zEtPB|h16^%e%F;Hr-`@Q-23_TYdA&2|GIra!*zMK@tJvDG|slmzJI%>oi^94G_7oP
zebsCu|7S-;%w{O#h10SJcr*%2^P(>M)D+rJ5zHh3kLD|KUrLtEDmt1z3ijd_7fabewS4sr!xM<@ymUn4b}6L-}%hD
z3OUu@{8b$yND3@o>OJf-|JATPXH>F#v3Ox5lQdN)&C@%Y+g@@ksf*hz-LNaX
z2Bpm#&1tWTlZ(KXp9oaLe}5*zt+`Ma*koT57vT~du`(0q-Vdj!Zrf?oER#Pr)mv%R
z?M&7!ar?=B@*`uWLG5cBW>uI%sAUCP-dO%{#`0Xdd7X_S|D6wy4_kgZYV14?iuGx~
zl3IG3@irExyN?)HG{%Na6BlNO``_*0sj?!81!7yx3PyS18(HSvE)V*}mnPl|4MzLk
zP*e=qx{Il&H&B!NVO)QxmF%NkhjL1VV8<)rv~x>`-#^+_x##lX^rqp$sR|DJsq&M2
z)&rjlXX>OuMF*M6I90L?Qoe#38^U}1M8@76fA7IS>W^?Os##=XXRnwnV~H3h6uS_M
zKf2C>EcU@3S0DEgz47>NbF2o@VX%wnm`a>$duMLfl6Kquh}&x8&x~qmrN+(+Gv!+t
zMAK?B*W|V&_EhQ+)w#R14M+RRRwTrQ2V*Hq)1>L|EURaAb#;eDR|=T=DyMW?IyhP`Ed_rDktBX{^ZlRE{q%bs=i7@KkpH6-{Jugnq*TQhX=
zI!(#SgYCKI0X&*TUDKuBhXVO(&PGjyb>4lys};7A!zM?t>)!E#kqqggxoko&TF=DS
z_svXUsdB3;%T{%`lf9sjYUb9#SeVkmGX~e@K>nT{7P5?LDh5%=Ac-&byxqovo=9%7
zn1vnc9>`u>nwE6tcPc!&7Gsvz8z4&Vu%h(%%JG`Cq|JqDWiphf{KIbc
ziM4qCBTrqjawdP*I2e)LXTP{D&BRGm>jtU^bg!SY8^;8Ndw^F-E-QYu)KUYgFRZhie|)hN6k
zxqJR+%Ckk{EA!tk+-rkz8^wLR!SN78=f6A%aniuya4sv;5&g}e5h|{NVIp=lD~sb1
z;I{Y#JFVr>-%jEX$XHq)ca@6}cW%lbn;ERP8q4cH@Vb7>?z4ZwUSDlaRc{s{4b(=-
z-nx%R{Oc{%-)56?n2sVgM
zU$zFoL1UK$_yk_lwk#~J*k!p6+=8X%@ckX!pTKGrH4RuRT)K2AZzQ8~s*&K5J<;~v
zM*_^B8CPmvU+@SxQXC$fLY#SbQ4qgzlTn!{n7%wpUx+VucBruu{vKkQ+iB}X%1^I3
z`~7JLX!)W29Ap3yVvcj1qMF@yE{zIGy5Qw-cm%3QU`Mt_)nf-iv4ri$3itS~t)V19
zzl<7;M5?@j_k+11=~Vc!9RxQDcCRSZVJN|V;FX8vIFZbXo9nP5sX5O!?$4J6T@YAk
zdHwn|>|$J6X8&at=bNvLm0~$k-yCy=-4+}SMY{3ewhlc!{Pf+4+(PXhgwM94kRTJii8)(i(T`^wWVA9#zk=pAitoMyyf}D)1bT^uB*%L
z{b0wmnFLI3tsGsmK9RxnkgM^+bYHa{i>`nP&K<~Z)_?c#*Py>{zhe!{y!-W?w{R+8x4hXV)#(Q;OMl{~J{|Yz=?gKh
zY1NMilb01bVbxcK!_SO$iZH6(eH**0Xn)Jg@uig9t_qka9y{2Qu8ky~i}%o)Y?zpR
zR1l*|NeSaKp}v<(6CM&*@l`KRA4hAAm5T_+Qos|u1RRo5M-b93eOkCUR)7E*t8C~?
z@X{%BE<}e}_T^yJ2ji}*i^$A>rF>-6429{$q%9gR&W&32huOu($Hym1a^@;$>h&&@
zyGbGDMM0C_%k;-O@{)+fr@JZO&@}EjmHf2L#6M!)5+r*cp@xxeM!k;?nSBBih7PDI
zLP7#(35K8_PCO;c7y%;W#izii>j4fjE-Ze(TO?Ya6fOjIo?|rFe{?8`P?KA{JSrm`
z=e#%Aob{DW}e*I8*E;u#MBDtbg)-68N*H6r5(h20G+yBZ?Eg$#4aKU)Nc
zL^SzDJM}_L*>z>t<`%!34yv>3!9p($T$Ydm56h=j;+ADtAA6$-hH