From fcb045228f2da59b7e90b9e135bec4d87b125f1e Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Mon, 24 Jan 2022 18:33:00 +0100 Subject: [PATCH 01/35] Post release version bump --- .github/workflows/ci.yml | 4 ++-- CHANGELOG.md | 10 ++++++++++ Dockerfile | 4 ++-- environment.yml | 2 +- nextflow.config | 4 ++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 96b09dc60..21b7ae020 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,13 +37,13 @@ jobs: - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/eager:2.4.2 + run: docker build --no-cache . -t nfcore/eager:dev - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/eager:dev - docker tag nfcore/eager:dev nfcore/eager:2.4.2 + docker tag nfcore/eager:dev nfcore/eager:dev - name: Install Nextflow env: diff --git a/CHANGELOG.md b/CHANGELOG.md index e479bc5aa..a0b308b4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
+## [dev] + +### `Added` + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + ## [2.4.2] - 2022-01-24 ### `Added` diff --git a/Dockerfile b/Dockerfile index 3cc4ec4ab..376e12937 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.4.2/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.4.3dev/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.4.2 > nf-core-eager-2.4.2.yml \ No newline at end of file +RUN conda env export --name nf-core-eager-2.4.3dev > nf-core-eager-2.4.3dev.yml \ No newline at end of file diff --git a/environment.yml b/environment.yml index 0db40a045..3df1ce155 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.4.2 +name: nf-core-eager-2.4.3dev channels: - conda-forge - bioconda diff --git a/nextflow.config b/nextflow.config index 36c2a0355..282a26b81 100644 --- a/nextflow.config +++ b/nextflow.config @@ -284,7 +284,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/eager:2.4.2' +process.container = 'nfcore/eager:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -414,7 +414,7 @@ manifest { description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline' mainScript = 'main.nf' nextflowVersion = '>=20.07.1' - version = '2.4.2' + version = '2.4.3dev' } // Function to ensure that resource requirements don't go beyond From b8d8581e01d26e7dc1e0df1a46eb86ee2d5e00e1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 09:08:25 +0100 Subject: [PATCH 02/35] Fix snpCapture bed --- .github/workflows/ci.yml | 4 ++++ CHANGELOG.md | 3 +++ main.nf | 15 ++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21b7ae020..3eca0283c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,6 +201,10 @@ jobs: - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq' + - name: SNPCAPTURE Run the basic pipeline with the bam input profile, generating statistics with a SNP capture bed + run: | + wget https://github.com/nf-core/test-datasets/raw/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz && 1240K.pos.list_hs37d5.0based.bed.gz + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --snpcapture_bed 1240K.pos.list_hs37d5.0based.bed - name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --run_sexdeterrmine diff --git a/CHANGELOG.md b/CHANGELOG.md index 
a0b308b4c..0d55a2f8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` +- [#838] Fix --snpcapture_bed files not being picked up by Nextflow +- Fix PMDtools reference mask not being picked up by Nextflow, and its use being evaluated against --snpcapture_bed rather than --pmdtools_reference_mask + ### `Dependencies` ### `Deprecated` diff --git a/main.nf b/main.nf index 6f89cd0fa..354d794a6 100644 --- a/main.nf +++ b/main.nf @@ -245,6 +245,13 @@ if ( !params.clip_adapters_list ) { .set {ch_adapterlist} } +if ( params.snpcapture_bed ) { + snpcapture_bed = file(params.snpcapture_bed, checkIfExists: true) +} + +if ( params.pmdtools_reference_mask ) { + pmdtoolsmask = file(params.pmdtools_reference_mask, checkIfExists: true) +} // SexDetermination channel set up and bedfile validation if (!params.sexdeterrmine_bedfile) { @@ -2145,12 +2152,14 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - if(params.snpcapture_bed){ - snpcap = (params.pmdtools_reference_mask) ? "--refseq ${params.pmdtools_reference_mask}" : '' + + if( params.pmdtools_reference_mask ){ + snpcap = (params.pmdtools_reference_mask) ? "--refseq ${pmdtools_reference_mask}" : '' log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } else { snpcap = '' } + def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? '--platypus' : '' """ @@ -2286,7 +2295,7 @@ process qualimap { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") into ch_qualimap_results script: - def snpcap = params.snpcapture_bed ? "-gff ${params.snpcapture_bed}" : '' + def snpcap = params.snpcapture_bed ? "-gff ${snpcapture_bed}" : '' """ qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . 
-outformat "HTML" ${snpcap} --java-mem-size=${task.memory.toGiga()}G """ From 937794295977a2d67cea245ed7389d34c3f41511 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 09:22:15 +0100 Subject: [PATCH 03/35] Remove deprecated parameters from test profiles --- CHANGELOG.md | 2 ++ conf/test_full.config | 1 - conf/test_stresstest_human.config | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..dde3616c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` +- [#836](https://github.com/nf-core/eager/issues/836) Remove deprecated parameters from test profiles + ### `Dependencies` ### `Deprecated` diff --git a/conf/test_full.config b/conf/test_full.config index da2827e77..1c3c5c0f9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -21,7 +21,6 @@ params { bwaalnl = 1024 run_bam_filtering = true - bam_discard_unmapped = true bam_unmapped_type = 'discard' bam_mapping_quality_threshold = 25 diff --git a/conf/test_stresstest_human.config b/conf/test_stresstest_human.config index f61d1b64e..3d3ed08a9 100644 --- a/conf/test_stresstest_human.config +++ b/conf/test_stresstest_human.config @@ -24,7 +24,6 @@ params { mtnucratio_header = 'ChrM' run_bam_filtering = true - bam_discard_unmapped = true bam_unmapped_type = 'discard' bam_mapping_quality_threshold = 30 From 307a621cff91c5a627001f4081de6da421e5d323 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 09:32:07 +0100 Subject: [PATCH 04/35] Add step size to lc_extrap docs --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index bbec384d3..743473454 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -806,7 +806,7 @@ "default": 1000, "description": "Specify the step size of Preseq.", "fa_icon": "fas fa-shoe-prints", 
- "help_text": "Can be used to configure the step size of Preseq's `c_curve` method. Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve parameter: `-s`" + "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` methods. Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" }, "preseq_maxextrap": { "type": "integer", From 01a9cf74671fc733f06b02a2165eaf10a85e7732 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 09:33:51 +0100 Subject: [PATCH 05/35] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..d67f13614 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` +- [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documentation to specify `-s` preseq parameter also applies to lc_extrap + ### `Dependencies` ### `Deprecated` From 99ae64d11346204f5f33a8f722739eed6adc8f57 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 09:53:49 +0100 Subject: [PATCH 06/35] Improve error message --- CHANGELOG.md | 2 ++ main.nf | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..cb31e395f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Fixed` +- [#828](https://github.com/nf-core/eager/issues/828) Improved error message if required metagenomic screening parameters not set correctly + ### `Dependencies` ### `Deprecated` diff --git a/main.nf b/main.nf index 6f89cd0fa..f3bf03c38 100644 --- a/main.nf +++ b/main.nf @@ -107,12 +107,13 @@ if (params.run_multivcfanalyzer) { } if (params.run_metagenomic_screening) { - if ( params.bam_unmapped_type == "discard" ) { - exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'." + + if ( !params.run_bam_filtering ) { + exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --run_bam_filtering --bam_unmapped_type 'fastq'." } - if (params.bam_unmapped_type != 'fastq' ) { - exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads in FASTQ format. Please supply --bam_unmapped_type 'fastq'. Found parameter: --bam_unmapped_type '${params.bam_unmapped_type}'." + if ( params.bam_unmapped_type != "fastq" ) { + exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'." 
} if (!params.database) { From 5f2c485c1ea2b87a30a9e78de6abebdd18bebcde Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 10:22:38 +0100 Subject: [PATCH 07/35] Fix CI test for new snpcapture bed test --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3eca0283c..599fd53af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -203,7 +203,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq' - name: SNPCAPTURE Run the basic pipeline with the bam input profile, generating statistics with a SNP capture bed run: | - wget https://github.com/nf-core/test-datasets/raw/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz && 1240K.pos.list_hs37d5.0based.bed.gz + wget https://github.com/nf-core/test-datasets/raw/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz && gunzip 1240K.pos.list_hs37d5.0based.bed.gz nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --snpcapture_bed 1240K.pos.list_hs37d5.0based.bed - name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination run: | From 41695849871a3f267c61f71506c122bb6d07fd53 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 10:52:53 +0100 Subject: [PATCH 08/35] Convert bed inputs to channels --- main.nf | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/main.nf b/main.nf index 354d794a6..816da9d31 100644 --- a/main.nf +++ b/main.nf @@ -246,11 +246,15 @@ if ( !params.clip_adapters_list ) { } if ( params.snpcapture_bed ) { - snpcapture_bed = file(params.snpcapture_bed, checkIfExists: true) + ch_snpcapture_bed = Channel.fromPath(params.snpcapture_bed, checkIfExists: true) +} else { + ch_snpcapture_bed = 
Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") } -if ( params.pmdtools_reference_mask ) { - pmdtoolsmask = file(params.pmdtools_reference_mask, checkIfExists: true) +if ( params.pmdtoolsmask ) { + ch_pmdtoolsmask = Channel.fromPath(params.pmdtoolsmask, checkIfExists: true) +} else { + ch_pmdtoolsmask = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") } // SexDetermination channel set up and bedfile validation @@ -2144,6 +2148,7 @@ process pmdtools { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_pmdtools file fasta from ch_fasta_for_pmdtools.collect() + path pmdtools_reference_mask from ch_pmdtoolsmask output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.pmd.bam"), path("*.pmd.bam.{bai,csi}") into ch_output_from_pmdtools @@ -2152,14 +2157,8 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - - if( params.pmdtools_reference_mask ){ - snpcap = (params.pmdtools_reference_mask) ? "--refseq ${pmdtools_reference_mask}" : '' - log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" - } else { - snpcap = '' - } - + def snpcap = pmdtools_reference_mask.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask}" : '' + if ( !params.pmdtools_reference_mask ) { log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? 
'--platypus' : '' """ @@ -2290,12 +2289,13 @@ process qualimap { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_addlibmerge_for_qualimap file fasta from ch_fasta_for_qualimap.collect() + path snpcapture_bed from ch_snpcapture_bed output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") into ch_qualimap_results script: - def snpcap = params.snpcapture_bed ? "-gff ${snpcapture_bed}" : '' + def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "-gff ${snpcapture_bed}" : '' """ qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . -outformat "HTML" ${snpcap} --java-mem-size=${task.memory.toGiga()}G """ From 42117f4ede9cbc597396c953d785811afa2fecb3 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 15:05:27 +0100 Subject: [PATCH 09/35] Revert pmd reference mask check but keep new syntax --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 816da9d31..4af2445a0 100644 --- a/main.nf +++ b/main.nf @@ -251,8 +251,8 @@ if ( params.snpcapture_bed ) { ch_snpcapture_bed = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") } -if ( params.pmdtoolsmask ) { - ch_pmdtoolsmask = Channel.fromPath(params.pmdtoolsmask, checkIfExists: true) +if ( params.pmdtools_reference_mask ) { + ch_pmdtoolsmask = Channel.fromPath(params.pmdtools_reference_mask, checkIfExists: true) } else { ch_pmdtoolsmask = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") } @@ -2157,8 +2157,8 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - def snpcap = pmdtools_reference_mask.getName() != 'nf-core_eager_dummy.txt' ? 
"--refseq ${pmdtools_reference_mask}" : '' - if ( !params.pmdtools_reference_mask ) { log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } + def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask}" : '' + if ( snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' && !params.pmdtools_reference_mask ) { log.info "[nf-core/eager] warn: No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? '--platypus' : '' """ From 9481c8403198889911cf647287c25ed11b28f4fa Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 10 Mar 2022 15:16:52 +0100 Subject: [PATCH 10/35] Re-add piping, and hope ARFP2 doesn't pipe errors into files anymore (only reported at one cluster) --- CHANGELOG.md | 2 ++ main.nf | 20 ++++++++------------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..665a78695 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Fixed` +- [#843](https://github.com/nf-core/eager/issues/843) Re-add direct piping of AdapterRemovalFixPrefix to pigz + ### `Dependencies` ### `Deprecated` diff --git a/main.nf b/main.nf index 6f89cd0fa..85c5bc76b 100644 --- a/main.nf +++ b/main.nf @@ -811,8 +811,8 @@ process adapter_removal { mv *.settings output/ ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz + """ //PE mode, collapse and trim, outputting all reads, preserving 5p } else if (seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && !params.mergedonly && params.preserve5p) { @@ -826,8 +826,8 @@ process adapter_removal { mv *.settings output/ ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz + """ // PE mode, collapse and trim but only output collapsed reads } else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && !params.preserve5p ) { @@ -838,8 +838,7 @@ process adapter_removal { cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g 
output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz mv *.settings output/ """ @@ -852,8 +851,7 @@ process adapter_removal { cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz mv *.settings output/ """ @@ -867,8 +865,7 @@ process adapter_removal { cat *.collapsed.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz mv *.settings output/ """ @@ -882,8 +879,7 @@ process adapter_removal { cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq - pigz -p ${task.cpus - 1} output/${base}.pe.combined.fq + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus - 1} > output/${base}.pe.combined.fq.gz mv *.settings output/ """ From 05533de90221cc8a66b4af46005186031bb84eff Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 11 Mar 2022 07:51:08 +0100 Subject: [PATCH 11/35] Fix the PMD test and add --basecomposition --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/main.nf b/main.nf index 4af2445a0..8707e3f66 100644 --- a/main.nf +++ b/main.nf @@ -2148,6 +2148,7 @@ process pmdtools { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_pmdtools file fasta from ch_fasta_for_pmdtools.collect() + path snpcapture_bed from ch_snpcapture_bed path pmdtools_reference_mask from ch_pmdtoolsmask output: @@ -2157,7 +2158,7 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask}" : '' + def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask} --basecomposition" : '' if ( snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' && !params.pmdtools_reference_mask ) { log.info "[nf-core/eager] warn: No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? '--platypus' : '' From 66d732816f99d9a57de891f513a22b0ec9712060 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 11 Mar 2022 08:11:25 +0100 Subject: [PATCH 12/35] Back to DSL1... 
--- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 8707e3f66..5cb45cb9c 100644 --- a/main.nf +++ b/main.nf @@ -246,9 +246,9 @@ if ( !params.clip_adapters_list ) { } if ( params.snpcapture_bed ) { - ch_snpcapture_bed = Channel.fromPath(params.snpcapture_bed, checkIfExists: true) + Channel.fromPath(params.snpcapture_bed, checkIfExists: true).into { ch_snpcapture_bed, ch_snpcapture_bed_pmd } } else { - ch_snpcapture_bed = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") + Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt").into { ch_snpcapture_bed, ch_snpcapture_bed_pmd } } if ( params.pmdtools_reference_mask ) { From ebae736b311cedc1e30a98e6e7fe883b6e7dba59 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 11 Mar 2022 08:15:17 +0100 Subject: [PATCH 13/35] Fix separateor on into --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 5cb45cb9c..0a3f2932b 100644 --- a/main.nf +++ b/main.nf @@ -246,9 +246,9 @@ if ( !params.clip_adapters_list ) { } if ( params.snpcapture_bed ) { - Channel.fromPath(params.snpcapture_bed, checkIfExists: true).into { ch_snpcapture_bed, ch_snpcapture_bed_pmd } + Channel.fromPath(params.snpcapture_bed, checkIfExists: true).into { ch_snpcapture_bed; ch_snpcapture_bed_pmd } } else { - Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt").into { ch_snpcapture_bed, ch_snpcapture_bed_pmd } + Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt").into { ch_snpcapture_bed; ch_snpcapture_bed_pmd } } if ( params.pmdtools_reference_mask ) { @@ -2148,7 +2148,7 @@ process pmdtools { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_pmdtools file fasta from ch_fasta_for_pmdtools.collect() - path snpcapture_bed from ch_snpcapture_bed + path snpcapture_bed from ch_snpcapture_bed_pmd path pmdtools_reference_mask from ch_pmdtoolsmask 
output: From 5d0f2dd54a70e0a48fdc0cfca810d872577e600e Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 11 Mar 2022 08:18:13 +0100 Subject: [PATCH 14/35] Fix dummy file clash --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0a3f2932b..7125feb2e 100644 --- a/main.nf +++ b/main.nf @@ -254,7 +254,7 @@ if ( params.snpcapture_bed ) { if ( params.pmdtools_reference_mask ) { ch_pmdtoolsmask = Channel.fromPath(params.pmdtools_reference_mask, checkIfExists: true) } else { - ch_pmdtoolsmask = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") + ch_pmdtoolsmask = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy2.txt") } // SexDetermination channel set up and bedfile validation From 53bf9f3c499690cc546312dacf871234c44dbdb6 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 15 Mar 2022 08:17:03 +0100 Subject: [PATCH 15/35] Update MQC for runtime speed up --- CHANGELOG.md | 2 ++ environment.yml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..68ebc6880 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Dependencies` +- Bumped MultiQC: 1.11 -> 1.12 (for run-time optimisation and tool citation information) + ### `Deprecated` ## [2.4.1] - 2021-11-30 diff --git a/environment.yml b/environment.yml index 3df1ce155..03b2c6b4b 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8 - - bioconda::multiqc=1.11 + - bioconda::multiqc=1.12 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 @@ -49,4 +49,4 @@ dependencies: - bioconda::eigenstratdatabasetools=1.0.2 - bioconda::mapdamage2=2.2.1 - bioconda::bbmap=38.92 - - bioconda::bcftools=1.12 \ No newline at end of file + - bioconda::bcftools=1.12 From e5e36a0d380663f6b413b91814f0664eb4882c78 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 15 Mar 2022 14:15:35 +0100 Subject: [PATCH 16/35] Reference masking prior to pmdtools, when requested. 
--- main.nf | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 6f89cd0fa..a243f21a6 100644 --- a/main.nf +++ b/main.nf @@ -184,7 +184,7 @@ if("${params.fasta}".endsWith(".gz")){ path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path` output: - path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats + path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats script: unzip = zipped_fasta.toString() - '.gz' @@ -195,7 +195,7 @@ if("${params.fasta}".endsWith(".gz")){ } else { fasta_for_indexing = Channel .fromPath("${params.fasta}", checkIfExists: true) - .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; 
ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling;ch_fasta_for_bcftools_stats } + .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling;ch_fasta_for_bcftools_stats } } // Check that fasta index file path ends in '.fai' @@ -245,6 +245,12 @@ if ( !params.clip_adapters_list ) { .set {ch_adapterlist} } +// Set up channel with pmdtools reference mask bedfile +if (!params.pmdtools_reference_mask) { + ch_bedfile_for_reference_masking = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") +} else { + ch_bedfile_for_reference_masking = Channel.fromPath(params.pmdtools_reference_mask, checkIfExists: true) +} // SexDetermination channel set up and bedfile validation if (!params.sexdeterrmine_bedfile) { @@ -2127,6 +2133,33 @@ process mapdamage_rescaling { // Optionally perform further aDNA evaluation or filtering for just reads with damage etc. 
+process mask_reference_for_pmdtools { + label 'sc_tiny' + tag "${fasta}" + publishDir "${params.outdir}/reference_genome/masked_reference", mode: params.publish_dir_mode + + when: (params.pmdtools_reference_mask && params.run_pmdtools) + + input: + file fasta from ch_unmasked_fasta_for_masking.collect() + file bedfile from ch_bedfile_for_reference_masking + + output: + file "${fasta.baseName}_masked.fa" into ch_masked_fasta_for_pmdtools + + script: + """ + bedtools maskfasta -fi ${fasta} -bed ${bedfile} -fo ${fasta.baseName}_masked.fa + """ +} + +// If masking was requested, use masked reference for pmdtools, else use original reference +if (params.pmdtools_reference_mask) { + ch_masked_fasta_for_pmdtools.set{ch_fasta_for_pmdtools} +} else { + ch_unmasked_fasta_for_pmdtools.set{ch_fasta_for_pmdtools} +} + process pmdtools { label 'mc_medium' tag "${libraryid}" From dec0534dd27c133d93134f64c9649a7db95af533 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 16 Mar 2022 09:02:30 +0100 Subject: [PATCH 17/35] Try adding UDG treatment to file name --- main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 6f89cd0fa..93b7e9d14 100644 --- a/main.nf +++ b/main.nf @@ -1944,6 +1944,7 @@ ch_input_for_librarymerging.merge_me [it[0], libraryid, it[2], seqtype, it[4], it[5], it[6], bam, bai ] } + .dump(tag: "input_for lib_merging") .set { ch_fixedinput_for_librarymerging } process library_merge { @@ -1960,8 +1961,8 @@ process library_merge { script: def size = params.large_ref ? 
'-c' : '' """ - samtools merge ${samplename}_libmerged_rmdup.bam ${bam} - samtools index ${samplename}_libmerged_rmdup.bam ${size} + samtools merge ${samplename}_${udg}_libmerged_rmdup.bam ${bam} + samtools index ${samplename}_${udg}_libmerged_rmdup.bam ${size} """ } From 816105259ff99ee491c523ba573230d670d766dc Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 16 Mar 2022 09:02:47 +0100 Subject: [PATCH 18/35] tweak file name for clarity --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 93b7e9d14..9940a8be3 100644 --- a/main.nf +++ b/main.nf @@ -1961,8 +1961,8 @@ process library_merge { script: def size = params.large_ref ? '-c' : '' """ - samtools merge ${samplename}_${udg}_libmerged_rmdup.bam ${bam} - samtools index ${samplename}_${udg}_libmerged_rmdup.bam ${size} + samtools merge ${samplename}_udg${udg}_libmerged_rmdup.bam ${bam} + samtools index ${samplename}_udg${udg}_libmerged_rmdup.bam ${size} """ } From 04df54127311b9017d8c828f50b0fcb61baddc50 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 17 Mar 2022 07:43:52 +0100 Subject: [PATCH 19/35] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..35ed26843 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Fixed` +- [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample + ### `Dependencies` ### `Deprecated` From 66961c5dbde56a672d1162933d75fddf305271c1 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 17 Mar 2022 07:47:22 +0100 Subject: [PATCH 20/35] Update docs --- docs/output.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index f2f4d995a..15afe0e8a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -701,5 +701,6 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * finally, the `*.kraken.out` file are the direct output of Kraken2 * `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) * `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. -* `librarymerged_bams/`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). 
This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) +* `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the same sample, **and** same UDG-treatment type will be in these BAM files. +* `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) * `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied. From 80ed69cfaa819bc3be271023edb6ae7fe184cd82 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 17 Mar 2022 11:42:27 +0100 Subject: [PATCH 21/35] Update CHANGELOG --- CHANGELOG.md | 6 ++++-- docs/output.md | 37 +++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68ebc6880..7280386dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` +- Renamed a range of MultiQC general stats table headers to improve clarity, documentation updated accordingly. + ### `Dependencies` +- Bumped MultiQC: 1.11 -> 1.12 (for run-time optimisation and tool citation information) + ### `Deprecated` ## [2.4.2] - 2022-01-24 @@ -25,8 +29,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### `Dependencies` -- Bumped python: 1.11 -> 1.12 (for run-time optimisation and tool citation information) - ### `Deprecated` ## [2.4.1] - 2021-11-30 diff --git a/docs/output.md b/docs/output.md index f2f4d995a..71e553f22 100644 --- a/docs/output.md +++ b/docs/output.md @@ -59,36 +59,37 @@ This table will report values per-file, library, or sample statistics depending Each column name is supplied by the module, so you may see similar column names. When unsure, hovering over the column name will allow you see which module it is derived from. -The possible columns displayed by default are as follows: +The possible columns displayed by default are as follows (note you may see additional columns depending on what other modules you activate): * **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. -* **Seqs** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. -* **Length** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. -* **%GC** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. +* **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. +* **Length Input Reads** This is from Pre-AdapterRemoval FastQC. 
This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. +* **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. * **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC %GC column. * **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. -* **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. -* **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. -* **Length** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules +* **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. +* **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. +* **Length Processed Reads** This is from post-AdapterRemoval FastQC. 
This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules * **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below). -* **Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. +* **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. * **% Unclassified** This is from Kraken. It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias. -* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. +* **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping. +* **Nr. Mapped Reads** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. * **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. -* **Reads Mapped** This is from Samtools. 
This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) -* **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. +* **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) +* **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. * **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. 
-* **%Dups** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). +* **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). * **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. -* **Mean Read Length** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -* **Median Read Length** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -* **Aligned** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. +* **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +* **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +* **Nr. Dedup. 
Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. * **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position. * **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. -* **% GC** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. +* **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. * **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. -* **XRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. -* **YRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. +* **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. +* **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. * **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller. * **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. * **Number of SNPs** This is from ANGSD. 
The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. From 5b5da7eac37ec153f4493ebb7e843fbe88538f6f Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 17 Mar 2022 11:43:32 +0100 Subject: [PATCH 22/35] Fix yml --- assets/multiqc_config.yaml | 72 +++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 82556a9ab..ba9050a05 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -61,43 +61,43 @@ extra_fn_clean_exts: top_modules: - 'fastqc': - name: 'FastQC (pre-Trimming)' - path_filters: - - '*_raw_fastqc.zip' + name: 'FastQC (pre-Trimming)' + path_filters: + - '*_raw_fastqc.zip' - 'fastp' - 'adapterRemoval' - 'fastqc': - name: 'FastQC (post-Trimming)' - path_filters: + name: 'FastQC (post-Trimming)' + path_filters: - '*.truncated_fastqc.zip' - '*.combined*_fastqc.zip' - 'bowtie2': - path_filters: + path_filters: - '*_bt2.log' - 'malt' - 'hops' - 'kraken' - 'samtools': - name: 'Samtools Flagstat (pre-samtools filter)' - path_filters: + name: 'Samtools Flagstat (pre-samtools filter)' + path_filters: - '*_flagstat.stats' - 'samtools': - name: 'Samtools Flagstat (post-samtools filter)' - path_filters: + name: 'Samtools Flagstat (post-samtools filter)' + path_filters: - '*_postfilterflagstat.stats' - 'dedup' - 'picard' - 'preseq': - path_filters: - - '*.preseq' + path_filters: + - '*.preseq' - 'damageprofiler' - 'mtnucratio' - 'qualimap' - 'sexdeterrmine' - 'bcftools' - 'multivcfanalyzer': - path_filters: - - '*MultiVCFAnalyzer.json' + path_filters: + - '*MultiVCFAnalyzer.json' qualimap_config: general_stats_coverage: - 1 @@ -107,7 +107,7 @@ qualimap_config: - 5 remove_sections: - - sexdeterrmine-snps + - sexdeterrmine-snps table_columns_visible: FastQC (pre-Trimming): @@ -272,5 +272,45 @@ report_section_order: order: -1000 nf-core-eager-summary: order: -1001 - export_plots: true 
+table_columns_name: + FastQC (pre-Trimming): + total_sequences: "Nr. Input Reads" + avg_sequence_length: "Length Input Reads" + percent_gc: "% GC Input Reads" + percent_duplicates: "% Dups Input Reads" + percent_fails: "% Failed Input Reads" + FastQC (post-Trimming): + total_sequences: "Nr. Processed Reads" + avg_sequence_length: "Length Processed Reads" + percent_gc: "% GC Processed Reads" + percent_duplicates: "% Dups Processed Reads" + percent_fails: "%Failed Processed Reads" + Samtools Flagstat (pre-samtools filter): + flagstat_total: "Nr. Reads Into Mapping" + mapped_passed: "Nr. Mapped Reads" + Samtools Flagstat (post-samtools filter): + flagstat_total: "Nr. Mapped Reads Post-Filter" + mapped_passed: "Nr. Mapped Reads Passed Post-Filter" + Endogenous DNA Post (%): + endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)" + Picard: + PERCENT_DUPLICATION: "% Dup. Mapped Reads" + DamageProfiler: + mean_readlength: "Mean Length Mapped Reads" + median_readlength: "Median Length Mapped Reads" + QualiMap: + mapped_reads: "Nr. Dedup. Mapped Reads" + total_reads: "Nr. Dedup. Total Reads" + avg_gc: "% GC Dedup. Mapped Reads" + Bcftools Stats: + number_of_records: "Nr. Overall Variants" + number_of_SNPs: "Nr. SNPs" + number_of_indels: "Nr. 
InDels" + MALT: + Mappability: "% Metagenomic Mappability" + SexDetErrmine: + RateErrX: "SexDet Err X Chr" + RateErrY: "SexDet Err Y Chr" + RateX: "SexDet Rate X Chr" + RateY: "SexDet Rate Y Chr" \ No newline at end of file From d1f92f1b41fa4503411e22ca24c0bfb33c4ba4c0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 14:08:03 +0100 Subject: [PATCH 23/35] Remove superfluous collect --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index a243f21a6..6ce427596 100644 --- a/main.nf +++ b/main.nf @@ -2141,7 +2141,7 @@ process mask_reference_for_pmdtools { when: (params.pmdtools_reference_mask && params.run_pmdtools) input: - file fasta from ch_unmasked_fasta_for_masking.collect() + file fasta from ch_unmasked_fasta_for_masking file bedfile from ch_bedfile_for_reference_masking output: From bbed8ca498f25804306e563cba1854d1c078e2c0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 14:09:52 +0100 Subject: [PATCH 24/35] Update sequencetools to 1.5.2 --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 3df1ce155..0d83fc2b8 100644 --- a/environment.yml +++ b/environment.yml @@ -31,7 +31,7 @@ dependencies: - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 - conda-forge::pigz=2.6 - - bioconda::sequencetools=1.4.0.6 + - bioconda::sequencetools=1.5.2 - bioconda::preseq=3.1.2 - bioconda::fastp=0.20.1 - bioconda::bamutil=1.0.15 From c8a05ab9053d210285fe45e2b035b8f4b3687a89 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 14:59:38 +0100 Subject: [PATCH 25/35] Updated option --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index bbec384d3..6802de91a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -889,9 +889,9 @@ }, "pmdtools_reference_mask": { "type": "string", - 
"description": "Specify a path to reference mask for PMDTools.", + "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", "fa_icon": "fas fa-mask", - "help_text": "Can be used to set a path to a reference genome mask for PMDTools." + "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." }, "pmdtools_max_reads": { "type": "integer", From f36ca8d700ed0ceba79a5d162dd4608a2dfafce1 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 15:17:33 +0100 Subject: [PATCH 26/35] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0b308b4c..dd507f4ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Fixed` +- [#844](https://github.com/nf-core/eager/issues/844) Fixed reference masking prior to pmdtools. 
+ ### `Dependencies` +- [#829](https://github.com/nf-core/eager/issues/829) Bumped sequencetools: 1.4.0.6 -> 1.5.2 + ### `Deprecated` ## [2.4.2] - 2022-01-24 From 339c8049c716cb8b17b3f0f0201d7b5cb4fbcf5f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 15:17:58 +0100 Subject: [PATCH 27/35] Add mention of reference_genome/masked_genome directory --- docs/output.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/output.md b/docs/output.md index f2f4d995a..458c32478 100644 --- a/docs/output.md +++ b/docs/output.md @@ -674,6 +674,7 @@ This section gives a brief summary of where to look for what files for downstrea Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. * `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. + * When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_reference` will be found here, containing the masked reference. * `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. * `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. * `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. 
From 883dd54d3d42edc9bad039a571e911430c021961 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 17 Mar 2022 16:04:50 +0100 Subject: [PATCH 28/35] Remove unnecessary variables for pmdtools process. --- main.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index f0871b325..11a611583 100644 --- a/main.nf +++ b/main.nf @@ -2154,6 +2154,7 @@ process mask_reference_for_pmdtools { file "${fasta.baseName}_masked.fa" into ch_masked_fasta_for_pmdtools script: + log.info "[nf-core/eager]: Masking reference \'${fasta}\' at positions found in \'${bedfile}\'. Masked reference will be used for pmdtools." """ bedtools maskfasta -fi ${fasta} -bed ${bedfile} -fo ${fasta.baseName}_masked.fa """ @@ -2176,8 +2177,6 @@ process pmdtools { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_pmdtools file fasta from ch_fasta_for_pmdtools.collect() - path snpcapture_bed from ch_snpcapture_bed_pmd - path pmdtools_reference_mask from ch_pmdtoolsmask output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.pmd.bam"), path("*.pmd.bam.{bai,csi}") into ch_output_from_pmdtools @@ -2186,18 +2185,16 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask} --basecomposition" : '' - if ( snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' && !params.pmdtools_reference_mask ) { log.info "[nf-core/eager] warn: No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? 
'--platypus' : '' """ #Run Filtering step - samtools calmd ${bam} ${fasta} | pmdtools --threshold ${params.pmdtools_threshold} ${treatment} ${snpcap} --header | samtools view -Sb - > "${libraryid}".pmd.bam + samtools calmd ${bam} ${fasta} | pmdtools --threshold ${params.pmdtools_threshold} ${treatment} --header | samtools view -Sb - > "${libraryid}".pmd.bam #Run Calc Range step ## To allow early shut off of pipe: https://github.com/nextflow-io/nextflow/issues/1564 trap 'if [[ \$? == 141 ]]; then echo "Shutting samtools early due to -n parameter" && samtools index ${libraryid}.pmd.bam ${size}; exit 0; fi' EXIT - samtools calmd ${bam} ${fasta} | pmdtools --deamination ${platypus} --range ${params.pmdtools_range} ${treatment} ${snpcap} -n ${params.pmdtools_max_reads} > "${libraryid}".cpg.range."${params.pmdtools_range}".txt + samtools calmd ${bam} ${fasta} | pmdtools --deamination ${platypus} --range ${params.pmdtools_range} ${treatment} -n ${params.pmdtools_max_reads} > "${libraryid}".cpg.range."${params.pmdtools_range}".txt samtools index ${libraryid}.pmd.bam ${size} """ From 4633e4a2bcc8445c9ed89a30004bff8e28e09115 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 17 Mar 2022 16:31:51 +0100 Subject: [PATCH 29/35] Update main.nf Co-authored-by: Thiseas C. Lamnidis --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9940a8be3..c3e26a7e0 100644 --- a/main.nf +++ b/main.nf @@ -1944,7 +1944,7 @@ ch_input_for_librarymerging.merge_me [it[0], libraryid, it[2], seqtype, it[4], it[5], it[6], bam, bai ] } - .dump(tag: "input_for lib_merging") + .dump(tag: "input_for_lib_merging") .set { ch_fixedinput_for_librarymerging } process library_merge { From ec266bd18c6f668ac61b6da372d799e7eccc1bb3 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 17 Mar 2022 16:56:03 +0100 Subject: [PATCH 30/35] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cbe83e4e..3b6fb3d80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#844](https://github.com/nf-core/eager/issues/844) Fixed reference masking prior to pmdtools - [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documention to specify `-s` preseq parameter also applies to lc_extrap - [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample -- Fix PMDtools reference mask not being picked up by Nextflow, and it's use being evaluated against --snpcapture_bed rather than --pmdtools_reference_mask - Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly ### `Dependencies` From c1b8df3f04512c583369a4851300c0c56b95749e Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 18 Mar 2022 08:32:14 +0100 Subject: [PATCH 31/35] Remove expected `.txt` suffix for snp coverage --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 8e3db7bca..2a43ba715 100644 --- a/main.nf +++ b/main.nf @@ -2549,11 +2549,11 @@ process eigenstrat_snp_coverage { /* The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. 
""" - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json + eigenstrat_snp_coverage -i pileupcaller.${strandedness} >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json """ */ """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt + eigenstrat_snp_coverage -i pileupcaller.${strandedness} >${strandedness}_eigenstrat_coverage.txt parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt """ } From 7efae629bd3349e2586d3553f62566fb862b9a00 Mon Sep 17 00:00:00 2001 From: marcel-keller <61977721+marcel-keller@users.noreply.github.com> Date: Tue, 22 Mar 2022 14:16:43 +0100 Subject: [PATCH 32/35] Corrected the --bamutils_clip flags Corrected the --bamutils_clip flags in the tutorials according to parameters tab. I did not check other flags, neither did a test run of the tutorials. --- docs/usage.md | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ad427be05..454b10a93 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1272,8 +1272,8 @@ nextflow run nf-core/eager \ --bam_mapping_quality_threshold 25 \ --bam_unmapped_type 'discard' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ <...> ``` @@ -1342,8 +1342,8 @@ nextflow run nf-core/eager \ --bam_mapping_quality_threshold 25 \ --bam_unmapped_type 'discard' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ --run_sexdeterrmine \ --sexdeterrmine_bedfile '../Reference/genome/1240k.sites.bed' \ --run_nuclear_contamination \ @@ -1383,8 +1383,8 @@ nextflow run 
nf-core/eager \ --bam_mapping_quality_threshold 25 \ --bam_unmapped_type 'discard' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ --run_sexdeterrmine \ --sexdeterrmine_bedfile '../Reference/genome/1240k.sites.bed' \ --run_nuclear_contamination \ @@ -1425,8 +1425,8 @@ nextflow run nf-core/eager \ --bam_mapping_quality_threshold 25 \ --bam_unmapped_type 'discard' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ --run_sexdeterrmine \ --sexdeterrmine_bedfile '../Reference/genome/1240k.sites.bed' \ --run_nuclear_contamination \ @@ -2397,10 +2397,10 @@ nextflow run nf-core/eager \ --run_bedtools_coverage \ --anno_file '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.gff' --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ ---bamutils_clip_none_udg_left 3 \ ---bamutils_clip_none_udg_right 3 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ +--bamutils_clip_double_stranded_none_udg_left 3 \ +--bamutils_clip_double_stranded_none_udg_right 3 \ <...> ``` @@ -2436,10 +2436,10 @@ nextflow run nf-core/eager \ --bam_unmapped_type 'discard' \ --dedupper 'markduplicates' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ ---bamutils_clip_none_udg_left 3 \ ---bamutils_clip_none_udg_right 3 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ +--bamutils_clip_double_stranded_none_udg_left 3 \ +--bamutils_clip_double_stranded_none_udg_right 3 \ --run_bedtools_coverage \ --anno_file '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.gff' \ --run_genotyping \ @@ -2479,10 +2479,10 @@ 
nextflow run nf-core/eager \ --bam_unmapped_type 'discard' \ --dedupper 'markduplicates' \ --run_trim_bam \ ---bamutils_clip_half_udg_left 2 \ ---bamutils_clip_half_udg_right 2 \ ---bamutils_clip_none_udg_left 3 \ ---bamutils_clip_none_udg_right 3 \ +--bamutils_clip_double_stranded_half_udg_left 2 \ +--bamutils_clip_double_stranded_half_udg_right 2 \ +--bamutils_clip_double_stranded_none_udg_left 3 \ +--bamutils_clip_double_stranded_none_udg_right 3 \ --run_bedtools_coverage \ --anno_file '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.gff' \ --run_genotyping \ From bf7e101785d1bf8c507e38351291a1965681abaf Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 22 Mar 2022 14:25:25 +0100 Subject: [PATCH 33/35] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b6fb3d80..4ac54982c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#828](https://github.com/nf-core/eager/issues/828) Improved error message if required metagenomic screening parameters not set correctly - [#836](https://github.com/nf-core/eager/issues/836) Remove deprecated parameters from test profiles -- [#838](https://github.com/nf-core/eager/issues/836) Fix --snpcapture_bed files not being picked up by Nextflow +- [#838](https://github.com/nf-core/eager/issues/836) Fix --snpcapture_bed files not being picked up by Nextflow (thanks to @meganemichel for reporting) - [#843](https://github.com/nf-core/eager/issues/843) Re-add direct piping of AdapterRemovalFixPrefix to pigz - [#844](https://github.com/nf-core/eager/issues/844) Fixed reference masking prior to pmdtools - [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documention to specify `-s` preseq parameter also applies to lc_extrap - [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample - Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly +- [#858](https://github.com/nf-core/eager/pull/858) Corrected tutorials to reflect updated BAM trimming flags (thanks to @marcel-keller for reporting) ### `Dependencies` From 26a0503ed980b30ac6ac115bd6a08fa4a5941bf4 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 13:10:08 +0100 Subject: [PATCH 34/35] Correct `-n` with `-N` in samtools fastq to retain paired information in BAM2FASTQ files when paired-end mapping --- CHANGELOG.md | 5 +++-- main.nf | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ac54982c..73a475102 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,13 +11,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#828](https://github.com/nf-core/eager/issues/828) Improved error message if required metagenomic screening parameters not set correctly - [#836](https://github.com/nf-core/eager/issues/836) Remove deprecated parameters from test profiles -- [#838](https://github.com/nf-core/eager/issues/836) Fix --snpcapture_bed files not being picked up by Nextflow (thanks to @meganemichel for reporting) +- [#838](https://github.com/nf-core/eager/issues/836) Fix --snpcapture_bed files not being picked up by Nextflow (❤ to @meganemichel for reporting) - [#843](https://github.com/nf-core/eager/issues/843) Re-add direct piping of AdapterRemovalFixPrefix to pigz - [#844](https://github.com/nf-core/eager/issues/844) Fixed reference masking prior to pmdtools - [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documention to specify `-s` preseq parameter also applies to lc_extrap - [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample - Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly -- [#858](https://github.com/nf-core/eager/pull/858) Corrected tutorials to reflect updated BAM trimming flags (thanks to @marcel-keller for reporting) +- [#857](https://github.com/nf-core/eager/issues/857) Corrected samtools fastq flag to retain read-pair information when converting off-target BAM files to fastq (❤ to @alexhbnr for reporting) +- [#858](https://github.com/nf-core/eager/pull/858) Corrected tutorials to reflect updated BAM trimming flags (❤ to @marcel-keller for reporting) ### `Dependencies` diff --git a/main.nf b/main.nf index 2a43ba715..afd26d4db 100644 --- a/main.nf +++ b/main.nf @@ -638,7 +638,7 @@ process convertBam { script: base = "${bam.baseName}" """ - samtools fastq -tn ${bam} | pigz -p ${task.cpus} > ${base}.converted.fastq.gz + samtools fastq -t ${bam} | pigz -p 
${task.cpus} > ${base}.converted.fastq.gz """ } @@ -1683,7 +1683,7 @@ process samtools_filter { samtools index ${libraryid}.filtered.bam ${size} ## FASTQ - samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus - 1} > ${libraryid}.unmapped.fastq.gz + samtools fastq -tN ${libraryid}.unmapped.bam | pigz -p ${task.cpus - 1} > ${libraryid}.unmapped.fastq.gz rm ${libraryid}.unmapped.bam """ } else if ( "${params.bam_unmapped_type}" == "both" && params.bam_filter_minreadlength == 0 ){ @@ -1693,7 +1693,7 @@ process samtools_filter { samtools index ${libraryid}.filtered.bam ${size} ## FASTQ - samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus -1} > ${libraryid}.unmapped.fastq.gz + samtools fastq -tN ${libraryid}.unmapped.bam | pigz -p ${task.cpus -1} > ${libraryid}.unmapped.fastq.gz """ // Unmapped/MAPQ Filtering WITH min-length filtering } else if ( "${params.bam_unmapped_type}" == "keep" && params.bam_filter_minreadlength != 0 ) { @@ -1723,7 +1723,7 @@ process samtools_filter { samtools index ${libraryid}.filtered.bam ${size} ## FASTQ - samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus - 1} > ${libraryid}.unmapped.fastq.gz + samtools fastq -tN ${libraryid}.unmapped.bam | pigz -p ${task.cpus - 1} > ${libraryid}.unmapped.fastq.gz rm ${libraryid}.unmapped.bam """ } else if ( "${params.bam_unmapped_type}" == "both" && params.bam_filter_minreadlength != 0 ){ @@ -1734,7 +1734,7 @@ process samtools_filter { samtools index ${libraryid}.filtered.bam ${size} ## FASTQ - samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz + samtools fastq -tN ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz """ } } From 61f6559ed21737fa662cc7adbc9c276e86f3f1d3 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 13:10:20 +0100 Subject: [PATCH 35/35] Update Changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 73a475102..612a91836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documention to specify `-s` preseq parameter also applies to lc_extrap - [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample - Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly -- [#857](https://github.com/nf-core/eager/issues/857) Corrected samtools fastq flag to retain read-pair information when converting off-target BAM files to fastq (❤ to @alexhbnr for reporting) +- [#857](https://github.com/nf-core/eager/issues/857) Corrected samtools fastq flag to _retain_ read-pair information when converting off-target BAM files to fastq in paired-end mapping (❤ to @alexhbnr for reporting) - [#858](https://github.com/nf-core/eager/pull/858) Corrected tutorials to reflect updated BAM trimming flags (❤ to @marcel-keller for reporting) ### `Dependencies`