From 199a02bccf173b99c14a08f0da0bb5286b04d0dc Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 29 Jul 2022 08:02:59 +0200 Subject: [PATCH 01/15] Pin to later version of node as in https://github.com/nf-core/tools/pull/1398 --- .github/workflows/linting.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 83a8bc100..ca1bc6194 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -12,9 +12,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/setup-node@v2 + - name: Install markdownlint run: npm install -g markdownlint-cli - name: Run Markdownlint From 2997adc0f3822a2780621994d032c20219076ab3 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 29 Jul 2022 08:06:55 +0200 Subject: [PATCH 02/15] Update linting.yml --- .github/workflows/linting.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index ca1bc6194..77b4b9d07 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -50,9 +50,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/setup-node@v2 + - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint From c6ea1174367dd801c60d7b7f5b14ffdadceef364 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Fri, 29 Jul 2022 08:09:41 +0200 Subject: [PATCH 03/15] Update .nf-core-lint.yml --- .nf-core-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.nf-core-lint.yml b/.nf-core-lint.yml index 496fea360..a39889277 100644 --- a/.nf-core-lint.yml +++ b/.nf-core-lint.yml @@ -3,4 +3,4 @@ files_unchanged: - .github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.md - docs/README.md - + - .github/workflows/linting.yml From 99046cb0faa2d0bd9c5affe8190e98b483d245ba Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:13:34 +0200 Subject: [PATCH 04/15] prettier fixes --- .github/CONTRIBUTING.md | 60 +- .github/ISSUE_TEMPLATE/bug_report.md | 3 +- .github/PULL_REQUEST_TEMPLATE.md | 6 +- .../pull_request_template.md | 20 +- .github/markdownlint.yml | 14 +- .github/workflows/awsfulltest.yml | 2 - .github/workflows/awstest.yml | 2 - .github/workflows/branch.yml | 2 - .github/workflows/ci.yml | 16 +- .github/workflows/linting.yml | 8 +- .github/workflows/linting_comment.yml | 2 - README.md | 221 +- assets/angsd_resources/README | 15 +- assets/email_template.html | 167 +- assets/multiqc_config.yaml | 592 +-- docs/README.md | 27 +- docs/output.md | 296 +- docs/usage.md | 338 +- nextflow_schema.json | 3416 ++++++++--------- 19 files changed, 2587 insertions(+), 2620 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 75b61b9ff..fc6028ac7 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -16,7 +16,7 @@ Contributions to the code are even more welcome ;) If you'd like to write some code for nf-core/eager, the standard workflow is as follows: 1. 
Check that there isn't already an issue about your idea in the [nf-core/eager issues](https://github.com/nf-core/eager/issues) to avoid duplicating work - * If there isn't one already, please create one so that others know you're working on this + - If there isn't one already, please create one so that others know you're working on this 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/eager repository](https://github.com/nf-core/eager) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build .` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). @@ -49,9 +49,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -* On your own fork, make a new branch `patch` based on `upstream/master`. -* Fix the bug, and bump version (X.Y.Z+1). -* A PR should be made on `master` from patch to directly this particular bug. +- On your own fork, make a new branch `patch` based on `upstream/master`. +- Fix the bug, and bump version (X.Y.Z+1). +- A PR should be made on `master` from patch to directly this particular bug. ## Getting help @@ -96,9 +96,9 @@ The process resources can be passed on to the tool dynamically within the proces Please use the following naming schemes, to make it easy to understand what is going where. 
-* initial process channel: `ch_output_from_` -* intermediate and terminal channels: `ch__for_` -* skipped process output: `ch__for_`(this goes out of the bypass statement described above) +- initial process channel: `ch_output_from_` +- intermediate and terminal channels: `ch__for_` +- skipped process output: `ch__for_`(this goes out of the bypass statement described above) ### Nextflow version bumping @@ -135,18 +135,18 @@ For all internal nf-core/eager documentation images we are using the 'Kalam' fon We are providing a highly configurable pipeline, with many options to turn on and off different processes in different combinations. This can make a very complex graph structure that can cause a large amount of duplicated channels coming out of every process to account for each possible combination. -The EAGER pipeline can currently be broken down into the following 'stages', where a stage is a collection of non-terminal mutually exclusive processes, which is the output of which is used for another file reporting module (but not reporting!) . +The EAGER pipeline can currently be broken down into the following 'stages', where a stage is a collection of non-terminal mutually exclusive processes, which is the output of which is used for another file reporting module (but not reporting!) . -* Input -* Convert BAM -* PolyG Clipping -* AdapterRemoval -* Mapping (either `bwa`, `bwamem`, or `circularmapper`) -* BAM Filtering -* Deduplication (either `dedup` or `markduplicates`) -* BAM Trimming -* PMDtools -* Genotyping +- Input +- Convert BAM +- PolyG Clipping +- AdapterRemoval +- Mapping (either `bwa`, `bwamem`, or `circularmapper`) +- BAM Filtering +- Deduplication (either `dedup` or `markduplicates`) +- BAM Trimming +- PMDtools +- Genotyping Every step can potentially be skipped, therefore the output of a previous stage must be able to be passed to the next stage, if the given stage is not run. 
@@ -154,16 +154,16 @@ To somewhat simplify this logic, we have implemented the following structure. The concept is as follows: -* Every 'stage' of the pipeline (i.e. collection of mutually exclusive processes) must always have a if else statement following it. -* This if else 'bypass' statement collects and standardises all possible input files into single channel(s) for the next stage. -* Importantly - within the bypass statement, a channel from the previous stage's bypass mixes into these output channels. This additional channel is named `ch_previousstage_for_skipcurrentstage`. This contains the output from the previous stage, i.e. not the modified version from the current stage. -* The bypass statement works as follows: - * If the current stage is turned on: will mix the previous stage and current stage output and filter for file suffixes unique to the current stage output - * If the current stage is turned off or skipped: will mix the previous stage and current stage output. However as there there is no files in the output channel from the current stage, no filtering is required and the files in the 'ch_XXX_for_skipXXX' stage will be used. - - This ensures the same channel inputs to the next stage is 'homogeneous' - i.e. all comes from the same source (the bypass statement) - - An example schematic can be given as follows +- Every 'stage' of the pipeline (i.e. collection of mutually exclusive processes) must always have a if else statement following it. +- This if else 'bypass' statement collects and standardises all possible input files into single channel(s) for the next stage. +- Importantly - within the bypass statement, a channel from the previous stage's bypass mixes into these output channels. This additional channel is named `ch_previousstage_for_skipcurrentstage`. This contains the output from the previous stage, i.e. not the modified version from the current stage. 
+- The bypass statement works as follows: + - If the current stage is turned on: will mix the previous stage and current stage output and filter for file suffixes unique to the current stage output + - If the current stage is turned off or skipped: will mix the previous stage and current stage output. However as there there is no files in the output channel from the current stage, no filtering is required and the files in the 'ch_XXX_for_skipXXX' stage will be used. + +This ensures the same channel inputs to the next stage is 'homogeneous' - i.e. all comes from the same source (the bypass statement) + +An example schematic can be given as follows ```nextflow // PREVIOUS STAGE OUTPUT @@ -191,7 +191,7 @@ process fastp { script: """ - echo "I have been fastp'd" > ${fq} + echo "I have been fastp'd" > ${fq} mv ${fq} ${fq}.pG.fq """ } @@ -206,4 +206,4 @@ if (params.run_fastp) { .into { ch_fastp_for_adapterremoval; ch_fastp_for_skipadapterremoval } } - ``` +``` diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b461caca3..596e363ee 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -18,8 +18,7 @@ Please delete this text and anything that's not relevant from the template below I have checked the following places for your error: - [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) -- [ ] [nf-core/eager pipeline documentation](https://nf-co.re/nf-core/eager/usage) - - nf-core/eager FAQ/troubleshooting can be found [here](https://nf-co.re/eager/usage#troubleshooting-and-faqs) +- [ ] [nf-core/eager pipeline documentation](https://nf-co.re/nf-core/eager/usage) - nf-core/eager FAQ/troubleshooting can be found [here](https://nf-co.re/eager/usage#troubleshooting-and-faqs) ## Description of the bug diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 864af6938..6f09b12b6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ 
b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,9 +16,9 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/eage - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint .`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - [ ] Usage Documentation in `docs/usage.md` is updated. diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md index 959f01ca4..80e155437 100644 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -4,15 +4,15 @@ Please fill in the appropriate checklist below (delete whatever is not relevant) ## PR checklist - - [ ] This comment contains a description of changes (with reason). - - [ ] If you've fixed a bug or added code that should be tested, add tests! 
- - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If necessary, also make a PR on the [nf-core/eager branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/eager). - - [ ] Make sure your code lints (`nf-core lint .`). - - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - - [ ] Usage Documentation in `docs/usage.md` is updated. - - [ ] Output Documentation in `docs/output.md` is updated. - - [ ] `CHANGELOG.md` is updated. - - [ ] `README.md` is updated (including new tool citations and authors/contributors). +- [ ] This comment contains a description of changes (with reason). +- [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If necessary, also make a PR on the [nf-core/eager branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/eager). +- [ ] Make sure your code lints (`nf-core lint .`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Usage Documentation in `docs/usage.md` is updated. +- [ ] Output Documentation in `docs/output.md` is updated. +- [ ] `CHANGELOG.md` is updated. +- [ ] `README.md` is updated (including new tool citations and authors/contributors). 
**Learn more about contributing:** https://github.com/nf-core/eager/tree/master/.github/CONTRIBUTING.md diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index 8d7eb53b0..24989492d 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -2,11 +2,11 @@ default: true line-length: false no-duplicate-header: - siblings_only: true + siblings_only: true no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary + allowed_elements: + - img + - p + - kbd + - details + - summary diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 4e03e75be..c12dc70a6 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -9,7 +9,6 @@ on: types: [completed] workflow_dispatch: - env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -18,7 +17,6 @@ env: AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} - jobs: run-awstest: name: Run AWS full tests diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 6e0a9538c..e889eed9e 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -6,7 +6,6 @@ name: nf-core AWS test on: workflow_dispatch: - env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -15,7 +14,6 @@ env: AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} - jobs: run-awstest: name: Run AWS tests diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 909b52d6b..5b39a40ed 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -15,7 +15,6 @@ jobs: run: | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/eager ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] - # If the above check failed, post a comment on the PR explaining 
the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment @@ -43,4 +42,3 @@ jobs: Thanks again for your contribution! repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8977cd31e..c2ab62933 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.07.1', ''] + nxf_ver: ["20.07.1", ""] steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -58,7 +58,7 @@ jobs: run: | git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data - name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs - run: | + run: | if [[ $NXF_VER = '' ]]; then sleep 1200; fi - name: BASIC Run the basic pipeline with directly supplied single-end FASTQ run: | @@ -74,7 +74,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --save_reference - name: REFERENCE Basic workflow, with supplied indices run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' - name: REFERENCE Run the basic pipeline with FastA reference with `fna` extension run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker @@ -107,7 +107,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 
'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' - name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_post_ar_trimming @@ -193,11 +193,11 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | - mkdir -p databases/maltextract - for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done + mkdir -p databases/maltextract + for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done - name: MALTEXTRACT Basic with MALT plus MaltExtract run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' 
--run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq' @@ -216,4 +216,4 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio - name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' \ No newline at end of file + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77b4b9d07..771dfd721 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -45,7 +45,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - YAML: runs-on: ubuntu-latest steps: @@ -82,11 +81,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - nf-core: runs-on: ubuntu-latest steps: - - name: Check out pipeline code uses: actions/checkout@v2 @@ -99,8 +96,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.6' - architecture: 'x64' + python-version: "3.6" + architecture: "x64" - name: Install dependencies run: | @@ -127,4 +124,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - diff --git 
a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 90f03c6f9..0471addcc 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,4 +1,3 @@ - name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork @@ -26,4 +25,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md - diff --git a/README.md b/README.md index 1b4ff5b36..4103766da 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ ## Introduction + **nf-core/eager** is a scalable and reproducible bioinformatics best-practise processing pipeline for genomic NGS sequencing data, with a focus on ancient DNA (aDNA) data. It is ideal for the (palaeo)genomic analysis of humans, animals, plants, microbes and even microbiomes. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. The pipeline pre-processes raw data from FASTQ inputs, or preprocessed BAM inputs. It can align reads and performs extensive general NGS and aDNA specific quality-control on the results. It comes with docker, singularity or conda containers making installation trivial and results highly reproducible. @@ -34,23 +35,23 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 3. Download the pipeline and test it on a minimal dataset with a single command: - ```bash - nextflow run nf-core/eager -profile test, - ``` + ```bash + nextflow run nf-core/eager -profile test, + ``` - > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. 
If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. 4. Start running your own analysis! - ```bash - nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' - ``` + ```bash + nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' + ``` 5. Once your run has completed successfully, clean up the intermediate files. - ```bash - nextflow clean -f -k - ``` + ```bash + nextflow clean -f -k + ``` See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. 
@@ -64,16 +65,16 @@ Modifications to the default pipeline are easily made using various options as d By default the pipeline currently performs the following: -* Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) -* Sequencing quality control (`FastQC`) -* Sequencing adapter removal, paired-end data merging (`AdapterRemoval`) -* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) -* Post-mapping processing, statistics and conversion to bam (`samtools`) -* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) -* PCR duplicate removal (`DeDup` or `MarkDuplicates`) -* Post-mapping statistics and BAM quality control (`Qualimap`) -* Library Complexity Estimation (`preseq`) -* Overall pipeline statistics summaries (`MultiQC`) +- Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) +- Sequencing quality control (`FastQC`) +- Sequencing adapter removal, paired-end data merging (`AdapterRemoval`) +- Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) +- Post-mapping processing, statistics and conversion to bam (`samtools`) +- Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) +- PCR duplicate removal (`DeDup` or `MarkDuplicates`) +- Post-mapping statistics and BAM quality control (`Qualimap`) +- Library Complexity Estimation (`preseq`) +- Overall pipeline statistics summaries (`MultiQC`) ### Additional Steps @@ -81,40 +82,40 @@ Additional functionality contained by the pipeline currently includes: #### Input -* Automatic merging of complex sequencing setups (e.g. multiple lanes, sequencing configurations, library types) +- Automatic merging of complex sequencing setups (e.g. 
multiple lanes, sequencing configurations, library types) #### Preprocessing -* Illumina two-coloured sequencer poly-G tail removal (`fastp`) -* Post-AdapterRemoval trimming of FASTQ files prior mapping (`fastp`) -* Automatic conversion of unmapped reads to FASTQ (`samtools`) -* Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples) +- Illumina two-coloured sequencer poly-G tail removal (`fastp`) +- Post-AdapterRemoval trimming of FASTQ files prior mapping (`fastp`) +- Automatic conversion of unmapped reads to FASTQ (`samtools`) +- Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples) #### aDNA Damage manipulation -* Damage removal/clipping for UDG+/UDG-half treatment protocols (`BamUtil`) -* Damaged reads extraction and assessment (`PMDTools`) -* Nuclear DNA contamination estimation of human samples (`angsd`) +- Damage removal/clipping for UDG+/UDG-half treatment protocols (`BamUtil`) +- Damaged reads extraction and assessment (`PMDTools`) +- Nuclear DNA contamination estimation of human samples (`angsd`) #### Genotyping -* Creation of VCF genotyping files (`GATK UnifiedGenotyper`, `GATK HaplotypeCaller` and `FreeBayes`) -* Creation of EIGENSTRAT genotyping files (`pileupCaller`) -* Creation of Genotype Likelihood files (`angsd`) -* Consensus sequence FASTA creation (`VCF2Genome`) -* SNP Table generation (`MultiVCFAnalyzer`) +- Creation of VCF genotyping files (`GATK UnifiedGenotyper`, `GATK HaplotypeCaller` and `FreeBayes`) +- Creation of EIGENSTRAT genotyping files (`pileupCaller`) +- Creation of Genotype Likelihood files (`angsd`) +- Consensus sequence FASTA creation (`VCF2Genome`) +- SNP Table generation (`MultiVCFAnalyzer`) #### Biological Information -* Mitochondrial to Nuclear read ratio calculation (`MtNucRatioCalculator`) -* Statistical sex determination of human individuals (`Sex.DetERRmine`) +- Mitochondrial to Nuclear read ratio calculation (`MtNucRatioCalculator`) +- Statistical sex determination of 
human individuals (`Sex.DetERRmine`) #### Metagenomic Screening -* Low-sequenced complexity filtering (`BBduk`) -* Taxonomic binner with alignment (`MALT`) -* Taxonomic binner without alignment (`Kraken2`) -* aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) +- Low-sequenced complexity filtering (`BBduk`) +- Taxonomic binner with alignment (`MALT`) +- Taxonomic binner without alignment (`Kraken2`) +- aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) #### Functionality Overview @@ -130,11 +131,11 @@ The nf-core/eager pipeline comes with documentation about the pipeline: [usage]( 1. [Nextflow installation](https://nf-co.re/usage/installation) 2. Pipeline configuration - * [Pipeline installation](https://nf-co.re/usage/local_installation) - * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) + - [Pipeline installation](https://nf-co.re/usage/local_installation) + - [Adding your own system config](https://nf-co.re/usage/adding_own_config) + - [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. [Running the pipeline](https://nf-co.re/eager/docs/usage.md) - * This includes tutorials, FAQs, and troubleshooting instructions + - This includes tutorials, FAQs, and troubleshooting instructions 4. [Output and how to interpret the results](https://nf-co.re/eager/docs/output.md) ## Credits @@ -146,43 +147,43 @@ of this pipeline: ## Authors (alphabetical) -* [Aida Andrades Valtueña](https://github.com/aidaanva) -* [Alexander Peltzer](https://github.com/apeltzer) -* [James A. Fellows Yates](https://github.com/jfy133) -* [Judith Neukamm](https://github.com/JudithNeukamm) -* [Maxime Borry](https://github.com/maxibor) -* [Maxime Garcia](https://github.com/MaxUlysse) -* [Stephen Clayton](https://github.com/sc13-bioinf) -* [Thiseas C. 
Lamnidis](https://github.com/TCLamnidis) -* [Zandra Fagernäs](https://github.com/ZandraFagernas) +- [Aida Andrades Valtueña](https://github.com/aidaanva) +- [Alexander Peltzer](https://github.com/apeltzer) +- [James A. Fellows Yates](https://github.com/jfy133) +- [Judith Neukamm](https://github.com/JudithNeukamm) +- [Maxime Borry](https://github.com/maxibor) +- [Maxime Garcia](https://github.com/MaxUlysse) +- [Stephen Clayton](https://github.com/sc13-bioinf) +- [Thiseas C. Lamnidis](https://github.com/TCLamnidis) +- [Zandra Fagernäs](https://github.com/ZandraFagernas) ## Additional Contributors (alphabetical) Those who have provided conceptual guidance, suggestions, bug reports etc. -* [Alexandre Gilardet](https://github.com/alexandregilardet) -* Arielle Munters -* [Åshild Vågene](https://github.com/ashildv) -* [Charles Plessy](https://github.com/charles-plessy) -* [Elina Salmela](https://github.com/esalmela) -* [Hester van Schalkwyk](https://github.com/hesterjvs) -* [Ido Bar](https://github.com/IdoBar) -* [Irina Velsko](https://github.com/ivelsko) -* [Işın Altınkaya](https://github.com/isinaltinkaya) -* [Johan Nylander](https://github.com/nylander) -* [Katerine Eaton](https://github.com/ktmeaton) -* [Kathrin Nägele](https://github.com/KathrinNaegele) -* [Luc Venturini](https://github.com/lucventurini) -* [Marcel Keller](https://github.com/marcel-keller) -* [Megan Michel](https://github.com/meganemichel) -* [Pierre Lindenbaum](https://github.com/lindenb) -* [Pontus Skoglund](https://github.com/pontussk) -* [Raphael Eisenhofer](https://github.com/EisenRa) -* [Roberta Davidson](https://github.com/roberta-davidson) -* [Torsten Günter](https://bitbucket.org/tguenther/) -* [Kevin Lord](https://github.com/lordkev) -* [He Yu](https://github.com/paulayu) -* [Selina Carlhoff](https://github.com/scarlhoff) +- [Alexandre Gilardet](https://github.com/alexandregilardet) +- Arielle Munters +- [Åshild Vågene](https://github.com/ashildv) +- [Charles 
Plessy](https://github.com/charles-plessy) +- [Elina Salmela](https://github.com/esalmela) +- [Hester van Schalkwyk](https://github.com/hesterjvs) +- [Ido Bar](https://github.com/IdoBar) +- [Irina Velsko](https://github.com/ivelsko) +- [Işın Altınkaya](https://github.com/isinaltinkaya) +- [Johan Nylander](https://github.com/nylander) +- [Katerine Eaton](https://github.com/ktmeaton) +- [Kathrin Nägele](https://github.com/KathrinNaegele) +- [Luc Venturini](https://github.com/lucventurini) +- [Marcel Keller](https://github.com/marcel-keller) +- [Megan Michel](https://github.com/meganemichel) +- [Pierre Lindenbaum](https://github.com/lindenb) +- [Pontus Skoglund](https://github.com/pontussk) +- [Raphael Eisenhofer](https://github.com/EisenRa) +- [Roberta Davidson](https://github.com/roberta-davidson) +- [Torsten Günter](https://bitbucket.org/tguenther/) +- [Kevin Lord](https://github.com/lordkev) +- [He Yu](https://github.com/paulayu) +- [Selina Carlhoff](https://github.com/scarlhoff) If you've contributed and you're missing in here, please let us know and we will add you in of course! @@ -210,43 +211,43 @@ You can cite the `nf-core` publication as follows: In addition, references of tools and data used in this pipeline are as follows: -* **EAGER v1**, CircularMapper, DeDup* Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z). Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) -* **FastQC** Download: [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -* **AdapterRemoval v2** Schubert, M., Lindgreen, S., & Orlando, L. (2016). 
AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. [https://doi.org/10.1186/s13104-016-1900-2](https://doi.org/10.1186/s13104-016-1900-2). Download: [https://github.com/MikkelSchubert/adapterremoval](https://github.com/MikkelSchubert/adapterremoval) -* **bwa** Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. [https://doi.org/10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324). Download: [http://bio-bwa.sourceforge.net/bwa.shtml](http://bio-bwa.sourceforge.net/bwa.shtml) -* **SAMtools** Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352). Download: [http://www.htslib.org/](http://www.htslib.org/) -* **DamageProfiler** Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). [https://doi.org/10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: [https://github.com/Integrative-Transcriptomics/DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler) -* **QualiMap** Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. [https://doi.org/10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566). Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) -* **preseq** Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. 
[https://doi.org/10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375). Download: [http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) -* **PMDTools** Skoglund, P., Northoff, B. H., Shunkov, M. V., Derevianko, A. P., Pääbo, S., Krause, J., & Jakobsson, M. (2014). Separating endogenous ancient DNA from modern day contamination in a Siberian Neandertal. Proceedings of the National Academy of Sciences of the United States of America, 111(6), 2229–2234. [https://doi.org/10.1073/pnas.1318934111](https://doi.org/10.1073/pnas.1318934111). Download: [https://github.com/pontussk/PMDtools](https://github.com/pontussk/PMDtools) -* **MultiQC** Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. [https://doi.org/10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354). Download: [https://multiqc.info/](https://multiqc.info/) -* **BamUtils** Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. [https://doi.org/10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114). Download: [https://genome.sph.umich.edu/wiki/BamUtil](https://genome.sph.umich.edu/wiki/BamUtil) -* **FastP** Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. [https://doi.org/10.1093/bioinformatics/bty560](https://doi.org/10.1093/bioinformatics/bty560). Download: [https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) -* **GATK 3.5** DePristo, M. A., Banks, E., Poplin, R., Garimella, K. V., Maguire, J. R., Hartl, C., … Daly, M. J. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. 
Nature Genetics, 43(5), 491–498. [https://doi.org/10.1038/ng.806](https://doi.org/10.1038/ng.806.).Download: [https://console.cloud.google.com/storage/browser/gatk](https://console.cloud.google.com/storage/browser/gatk) -* **GATK 4.X** - no citation available yet. Download: [https://github.com/broadinstitute/gatk/releases](https://github.com/broadinstitute/gatk/releases) -* **VCF2Genome** - Alexander Herbig and Alex Peltzer (unpublished). Download: [https://github.com/apeltzer/VCF2Genome](https://github.com/apeltzer/VCF2Genome) -* **MultiVCFAnalyzer** Bos, K.I. et al., 2014. Pre-Columbian mycobacterial genomes reveal seals as a source of New World human tuberculosis. Nature, 514(7523), pp.494–497. Available at: [http://dx.doi.org/10.1038/nature13591](http://dx.doi.org/10.1038/nature13591). Download: [https://github.com/alexherbig/MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) -* **MTNucRatioCalculator** Alex Peltzter (Unpublished). Download: [https://github.com/apeltzer/MTNucRatioCalculator](https://github.com/apeltzer/MTNucRatioCalculator) -* **Sex.DetERRmine.py** Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: [http://dx.doi.org/10.1038/s41467-018-07483-5](http://dx.doi.org/10.1038/s41467-018-07483-5). Download: [https://github.com/TCLamnidis/Sex.DetERRmine.git](https://github.com/TCLamnidis/Sex.DetERRmine.git) -* **ANGSD** Korneliussen, T.S., Albrechtsen, A. & Nielsen, R., 2014. ANGSD: Analysis of Next Generation Sequencing Data. BMC bioinformatics, 15, p.356. Available at: [http://dx.doi.org/10.1186/s12859-014-0356-4](http://dx.doi.org/10.1186/s12859-014-0356-4). Download: [https://github.com/ANGSD/angsd](https://github.com/ANGSD/angsd) -* **bedtools** Quinlan, A.R. & Hall, I.M., 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics , 26(6), pp.841–842. 
Available at: [http://dx.doi.org/10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033). Download: [https://github.com/arq5x/bedtools2/releases](https://github.com/arq5x/bedtools2/) -* **MALT**. Download: [https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html) - * Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). - * Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). -* **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) -* **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) -* **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) -* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). 
-* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) -* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) -* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) -* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/) +- **EAGER v1**, CircularMapper, DeDup\* Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z). Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) +- **FastQC** Download: [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- **AdapterRemoval v2** Schubert, M., Lindgreen, S., & Orlando, L. (2016). AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. [https://doi.org/10.1186/s13104-016-1900-2](https://doi.org/10.1186/s13104-016-1900-2). Download: [https://github.com/MikkelSchubert/adapterremoval](https://github.com/MikkelSchubert/adapterremoval) +- **bwa** Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. [https://doi.org/10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324). 
Download: [http://bio-bwa.sourceforge.net/bwa.shtml](http://bio-bwa.sourceforge.net/bwa.shtml) +- **SAMtools** Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352). Download: [http://www.htslib.org/](http://www.htslib.org/) +- **DamageProfiler** Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). [https://doi.org/10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: [https://github.com/Integrative-Transcriptomics/DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler) +- **QualiMap** Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. [https://doi.org/10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566). Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) +- **preseq** Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. [https://doi.org/10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375). Download: [http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) +- **PMDTools** Skoglund, P., Northoff, B. H., Shunkov, M. V., Derevianko, A. P., Pääbo, S., Krause, J., & Jakobsson, M. (2014). Separating endogenous ancient DNA from modern day contamination in a Siberian Neandertal. Proceedings of the National Academy of Sciences of the United States of America, 111(6), 2229–2234. [https://doi.org/10.1073/pnas.1318934111](https://doi.org/10.1073/pnas.1318934111). 
Download: [https://github.com/pontussk/PMDtools](https://github.com/pontussk/PMDtools) +- **MultiQC** Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. [https://doi.org/10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354). Download: [https://multiqc.info/](https://multiqc.info/) +- **BamUtils** Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. [https://doi.org/10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114). Download: [https://genome.sph.umich.edu/wiki/BamUtil](https://genome.sph.umich.edu/wiki/BamUtil) +- **FastP** Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. [https://doi.org/10.1093/bioinformatics/bty560](https://doi.org/10.1093/bioinformatics/bty560). Download: [https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) +- **GATK 3.5** DePristo, M. A., Banks, E., Poplin, R., Garimella, K. V., Maguire, J. R., Hartl, C., … Daly, M. J. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nature Genetics, 43(5), 491–498. [https://doi.org/10.1038/ng.806](https://doi.org/10.1038/ng.806.).Download: [https://console.cloud.google.com/storage/browser/gatk](https://console.cloud.google.com/storage/browser/gatk) +- **GATK 4.X** - no citation available yet. Download: [https://github.com/broadinstitute/gatk/releases](https://github.com/broadinstitute/gatk/releases) +- **VCF2Genome** - Alexander Herbig and Alex Peltzer (unpublished). Download: [https://github.com/apeltzer/VCF2Genome](https://github.com/apeltzer/VCF2Genome) +- **MultiVCFAnalyzer** Bos, K.I. et al., 2014. 
Pre-Columbian mycobacterial genomes reveal seals as a source of New World human tuberculosis. Nature, 514(7523), pp.494–497. Available at: [http://dx.doi.org/10.1038/nature13591](http://dx.doi.org/10.1038/nature13591). Download: [https://github.com/alexherbig/MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) +- **MTNucRatioCalculator** Alex Peltzter (Unpublished). Download: [https://github.com/apeltzer/MTNucRatioCalculator](https://github.com/apeltzer/MTNucRatioCalculator) +- **Sex.DetERRmine.py** Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: [http://dx.doi.org/10.1038/s41467-018-07483-5](http://dx.doi.org/10.1038/s41467-018-07483-5). Download: [https://github.com/TCLamnidis/Sex.DetERRmine.git](https://github.com/TCLamnidis/Sex.DetERRmine.git) +- **ANGSD** Korneliussen, T.S., Albrechtsen, A. & Nielsen, R., 2014. ANGSD: Analysis of Next Generation Sequencing Data. BMC bioinformatics, 15, p.356. Available at: [http://dx.doi.org/10.1186/s12859-014-0356-4](http://dx.doi.org/10.1186/s12859-014-0356-4). Download: [https://github.com/ANGSD/angsd](https://github.com/ANGSD/angsd) +- **bedtools** Quinlan, A.R. & Hall, I.M., 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics , 26(6), pp.841–842. Available at: [http://dx.doi.org/10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033). Download: [https://github.com/arq5x/bedtools2/releases](https://github.com/arq5x/bedtools2/) +- **MALT**. Download: [https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html) + - Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. 
Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). + - Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). +- **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) +- **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) +- **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) +- **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). +- **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) +- **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) +- **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. 
[https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) +- **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/) ## Data References This repository uses test data from the following studies: -* Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1). -* Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257). -* Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. [doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114). -* de Barros Damgaard, P. et al. (2018). '137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2) +- Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1). +- Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257). +- Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. 
[doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114). +- de Barros Damgaard, P. et al. (2018). '137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2) diff --git a/assets/angsd_resources/README b/assets/angsd_resources/README index 49cfd6c06..0f2b8c018 100644 --- a/assets/angsd_resources/README +++ b/assets/angsd_resources/README @@ -7,9 +7,8 @@ wget http://hapmap.ncbi.nlm.nih.gov/downloads/frequencies/2010-08_phaseII+III/al wget http://hapmap.ncbi.nlm.nih.gov/downloads/frequencies/2010-08_phaseII+III/allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz #with the md5sum -a105316eaa2ebbdb3f8d62a9cb10a2d5 allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz -5a0f920951ce2ded4afe2f10227110ac allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz - +a105316eaa2ebbdb3f8d62a9cb10a2d5 allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz +5a0f920951ce2ded4afe2f10227110ac allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz ##create dummy bed file to use the liftover tools gunzip -c allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz| awk '{print $2" "$3-1" "$3" "$11" "$12" "$4" "$14}'|sed 1d >allele.txt @@ -18,8 +17,7 @@ gunzip -c allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz| awk '{print $2" "$3-1" "$ liftOver allele.txt /opt/liftover/hg18ToHg19.over.chain.gz hit nohit ##now remove invarible sites, and redundant columns -cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchrX.gz - +cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchrX.gz ##create dummy bed file to use the liftover tools gunzip -c allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt| awk '{print $2" "$3-1" "$3" "$11" "$12" "$4" "$14}'|sed 1d >allele.txt @@ -28,15 +26,14 @@ gunzip -c allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt| awk '{print $2" "$3-1" "$3" liftOver allele.txt /opt/liftover/hg18ToHg19.over.chain.gz hit nohit ##now remove invarible sites, and redundant columns -cut -f1,3 
--complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchr21.gz - +cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchr21.gz ####### ##download 100kmer mappability wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign100mer.bigWig #md5sum -a1b1a8c99431fedf6a3b4baef028cca4 wgEncodeCrgMapabilityAlign100mer.bigWig +a1b1a8c99431fedf6a3b4baef028cca4 wgEncodeCrgMapabilityAlign100mer.bigWig ##download convert program wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph @@ -45,6 +42,6 @@ wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph ./bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig chrX -chrom=chrX ./bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig chr21 -chrom=chr21 -##only keep unique regions and discard the chr* column +##only keep unique regions and discard the chr\* column grep -P "\t1$" chr21 |cut -f2-3 |gzip -c >chr21.unique.gz grep -P "\t1$" chrX |cut -f2-3 |gzip -c >chrX.unique.gz diff --git a/assets/email_template.html b/assets/email_template.html index 36bfc9c8d..1e4a996f1 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,53 +1,134 @@ - - - - + + + + - - Codestin Search App - - -
+ + Codestin Search App + + +
+ - +

nf-core/eager v${version}

+

Run Name: $runName

-

nf-core/eager v${version}

-

Run Name: $runName

- -<% if (!success){ - out << """ -
-

nf-core/eager execution completed unsuccessfully!

-

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

+ <% if (!success){ out << """ +
+

+ nf-core/eager execution completed unsuccessfully! +

+

+ The exit status of the task that caused the workflow execution to fail + was: $exitStatus. +

The full error message was:

-
${errorReport}
-
- """ -} else { - out << """ -
+
+${errorReport}
+
+ """ } else { out << """ +
nf-core/eager execution completed successfully! -
- """ -} -%> +
+ """ } %> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
$commandLine
+

+ The workflow was completed at $dateComplete (duration: + $duration) +

+

The command used to launch the workflow was as follows:

+
+$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> "" }.join("\n") %> - -
$k
$v
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> " + + + + + " }.join("\n") %> + +
+ $k + +
$v
+
-

nf-core/eager

-

https://github.com/nf-core/eager

- -
- - +

nf-core/eager

+

+ https://github.com/nf-core/eager +

+
+ diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index ba9050a05..354485c2a 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,316 +1,316 @@ -custom_logo: 'nf-core_eager_logo_outline_drop.png' +custom_logo: "nf-core_eager_logo_outline_drop.png" custom_logo_url: https://github.com/nf-core/eager/ -custom_logo_title: 'nf-core/eager' +custom_logo_title: "nf-core/eager" report_comment: > - This report has been generated by the nf-core/eager - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/eager + analysis pipeline. For information about how to interpret these results, please see the + documentation. run_modules: - - adapterRemoval - - bowtie2 - - custom_content - - damageprofiler - - dedup - - fastp - - fastqc - - gatk - - kraken - - malt - - mtnucratio - - multivcfanalyzer - - picard - - preseq - - qualimap - - samtools - - sexdeterrmine - - hops - - bcftools + - adapterRemoval + - bowtie2 + - custom_content + - damageprofiler + - dedup + - fastp + - fastqc + - gatk + - kraken + - malt + - mtnucratio + - multivcfanalyzer + - picard + - preseq + - qualimap + - samtools + - sexdeterrmine + - hops + - bcftools extra_fn_clean_exts: - - '_fastp' - - '.pe.settings' - - '.se.settings' - - '.settings' - - '.pe.combined' - - '.se.truncated' - - '.mapped' - - '.mapped_rmdup' - - '.mapped_rmdup_stats' - - '_libmerged_rg_rmdup' - - '_libmerged_rg_rmdup_stats' - - '_postfilterflagstat.stats' - - '_flagstat.stat' - - '.filtered' - - '.filtered_rmdup' - - '.filtered_rmdup_stats' - - '_libmerged_rg_add' - - '_libmerged_rg_add_stats' - - '_rmdup' - - '.unmapped' - - '.fastq.gz' - - '.fastq' - - '.fq.gz' - - '.fq' - - '.bam' - - '.kreport' - - '.unifiedgenotyper' - - '.trimmed_stats' - - '_libmerged' - - '_bt2' + - "_fastp" + - ".pe.settings" + - ".se.settings" + - ".settings" + - ".pe.combined" + - ".se.truncated" + - ".mapped" + - 
".mapped_rmdup" + - ".mapped_rmdup_stats" + - "_libmerged_rg_rmdup" + - "_libmerged_rg_rmdup_stats" + - "_postfilterflagstat.stats" + - "_flagstat.stat" + - ".filtered" + - ".filtered_rmdup" + - ".filtered_rmdup_stats" + - "_libmerged_rg_add" + - "_libmerged_rg_add_stats" + - "_rmdup" + - ".unmapped" + - ".fastq.gz" + - ".fastq" + - ".fq.gz" + - ".fq" + - ".bam" + - ".kreport" + - ".unifiedgenotyper" + - ".trimmed_stats" + - "_libmerged" + - "_bt2" top_modules: - - 'fastqc': - name: 'FastQC (pre-Trimming)' - path_filters: - - '*_raw_fastqc.zip' - - 'fastp' - - 'adapterRemoval' - - 'fastqc': - name: 'FastQC (post-Trimming)' - path_filters: - - '*.truncated_fastqc.zip' - - '*.combined*_fastqc.zip' - - 'bowtie2': - path_filters: - - '*_bt2.log' - - 'malt' - - 'hops' - - 'kraken' - - 'samtools': - name: 'Samtools Flagstat (pre-samtools filter)' - path_filters: - - '*_flagstat.stats' - - 'samtools': - name: 'Samtools Flagstat (post-samtools filter)' - path_filters: - - '*_postfilterflagstat.stats' - - 'dedup' - - 'picard' - - 'preseq': - path_filters: - - '*.preseq' - - 'damageprofiler' - - 'mtnucratio' - - 'qualimap' - - 'sexdeterrmine' - - 'bcftools' - - 'multivcfanalyzer': - path_filters: - - '*MultiVCFAnalyzer.json' + - "fastqc": + name: "FastQC (pre-Trimming)" + path_filters: + - "*_raw_fastqc.zip" + - "fastp" + - "adapterRemoval" + - "fastqc": + name: "FastQC (post-Trimming)" + path_filters: + - "*.truncated_fastqc.zip" + - "*.combined*_fastqc.zip" + - "bowtie2": + path_filters: + - "*_bt2.log" + - "malt" + - "hops" + - "kraken" + - "samtools": + name: "Samtools Flagstat (pre-samtools filter)" + path_filters: + - "*_flagstat.stats" + - "samtools": + name: "Samtools Flagstat (post-samtools filter)" + path_filters: + - "*_postfilterflagstat.stats" + - "dedup" + - "picard" + - "preseq": + path_filters: + - "*.preseq" + - "damageprofiler" + - "mtnucratio" + - "qualimap" + - "sexdeterrmine" + - "bcftools" + - "multivcfanalyzer": + path_filters: + - 
"*MultiVCFAnalyzer.json" qualimap_config: - general_stats_coverage: - - 1 - - 2 - - 3 - - 4 - - 5 + general_stats_coverage: + - 1 + - 2 + - 3 + - 4 + - 5 remove_sections: - - sexdeterrmine-snps + - sexdeterrmine-snps table_columns_visible: - FastQC (pre-Trimming): - percent_duplicates: False - percent_gc: True - avg_sequence_length: True - fastp: - pct_duplication: False - after_filtering_gc_content: True - pct_surviving: False - Adapter Removal: - aligned_total: False - percent_aligned: True - FastQC (post-Trimming): - avg_sequence_length: True - percent_duplicates: False - total_sequences: True - percent_gc: True - bowtie2: - overall_alignment_rate: True - MALT: - Taxonomic assignment success: False - Assig. Taxonomy: False - Mappability: True - Total reads: False - Num. of queries: False - Kraken: - '% Unclassified': True - '% Top 5': False - Samtools Flagstat (pre-samtools filter): - flagstat_total: True - mapped_passed: True - Samtools Flagstat (post-samtools filter): - mapped_passed: True - DeDup: - dup_rate: False - clusterfactor: True - mapped_after_dedup: True - Picard: - PERCENT_DUPLICATION: True - DamageProfiler: - 5 Prime1: True - 5 Prime2: True - 3 Prime1: False - 3 Prime2: False - mean_readlength: True - median: True - mtnucratio: - mt_nuc_ratio: True - QualiMap: - mapped_reads: True - mean_coverage: True - 1_x_pc: True - 5_x_pc: True - percentage_aligned: False - median_insert_size: False - MultiVCFAnalyzer: - Heterozygous SNP alleles (percent): True - endorSpy: - endogenous_dna: True - endogenous_dna_post: True - nuclear_contamination: - Num_SNPs: True - Method1_MOM_estimate: False - Method1_MOM_SE: False - Method1_ML_estimate: True - Method1_ML_SE: True - Method2_MOM_estimate: False - Method2_MOM_SE: False - Method2_ML_estimate: False - Method2_ML_SE: False - snp_coverage: - Covered_Snps: True - Total_Snps: False + FastQC (pre-Trimming): + percent_duplicates: False + percent_gc: True + avg_sequence_length: True + fastp: + pct_duplication: False + 
after_filtering_gc_content: True + pct_surviving: False + Adapter Removal: + aligned_total: False + percent_aligned: True + FastQC (post-Trimming): + avg_sequence_length: True + percent_duplicates: False + total_sequences: True + percent_gc: True + bowtie2: + overall_alignment_rate: True + MALT: + Taxonomic assignment success: False + Assig. Taxonomy: False + Mappability: True + Total reads: False + Num. of queries: False + Kraken: + "% Unclassified": True + "% Top 5": False + Samtools Flagstat (pre-samtools filter): + flagstat_total: True + mapped_passed: True + Samtools Flagstat (post-samtools filter): + mapped_passed: True + DeDup: + dup_rate: False + clusterfactor: True + mapped_after_dedup: True + Picard: + PERCENT_DUPLICATION: True + DamageProfiler: + 5 Prime1: True + 5 Prime2: True + 3 Prime1: False + 3 Prime2: False + mean_readlength: True + median: True + mtnucratio: + mt_nuc_ratio: True + QualiMap: + mapped_reads: True + mean_coverage: True + 1_x_pc: True + 5_x_pc: True + percentage_aligned: False + median_insert_size: False + MultiVCFAnalyzer: + Heterozygous SNP alleles (percent): True + endorSpy: + endogenous_dna: True + endogenous_dna_post: True + nuclear_contamination: + Num_SNPs: True + Method1_MOM_estimate: False + Method1_MOM_SE: False + Method1_ML_estimate: True + Method1_ML_SE: True + Method2_MOM_estimate: False + Method2_MOM_SE: False + Method2_ML_estimate: False + Method2_ML_SE: False + snp_coverage: + Covered_Snps: True + Total_Snps: False table_columns_placement: - FastQC (pre-Trimming): - total_sequences: 100 - avg_sequence_length: 110 - percent_gc: 120 - fastp: - after_filtering_gc_content: 200 - Adapter Removal: - percent_aligned: 300 - FastQC (post-Trimming): - total_sequences: 400 - avg_sequence_length: 410 - percent_gc: 420 - Bowtie 2 / HiSAT2: - overall_alignment_rate: 450 - MALT: - Num. of queries: 430 - Total reads: 440 - Mappability: 450 - Assig. 
Taxonomy: 460 - Taxonomic assignment success: 470 - Kraken: - '% Unclassified': 480 - Samtools Flagstat (pre-samtools filter): - flagstat_total: 551 - mapped_passed: 552 - Samtools Flagstat (post-samtools filter): - flagstat_total: 600 - mapped_passed: 620 - endorSpy: - endogenous_dna: 610 - endogenous_dna_post: 640 - nuclear_contamination: - Num_SNPs: 1100 - Method1_MOM_estimate: 1110 - Method1_MOM_SE: 1120 - Method1_ML_estimate: 1130 - Method1_ML_SE: 1140 - Method2_MOM_estimate: 1150 - Method2_MOM_SE: 1160 - Method2_ML_estimate: 1170 - Method2_ML_SE: 1180 - snp_coverage: - Covered_Snps: 1050 - Total_Snps: 1060 - DeDup: - mapped_after_dedup: 620 - clusterfactor: 630 - Picard: - PERCENT_DUPLICATION: 650 - DamageProfiler: - 5 Prime1: 700 - 5 Prime2: 710 - 3 Prime1: 720 - 3 Prime2: 730 - mean_readlength: 740 - median: 750 - mtnucratio: - mtreads: 760 - mt_cov_avg: 770 - mt_nuc_ratio: 780 - QualiMap: - mapped_reads: 800 - mean_coverage: 805 - median_coverage: 810 - 1_x_pc: 820 - 2_x_pc: 830 - 3_x_pc: 840 - 4_x_pc: 850 - 5_x_pc: 860 - avg_gc: 870 - sexdeterrmine: - RateX: 1000 - RateY: 1010 - MultiVCFAnalyzer: - Heterozygous SNP alleles (percent): 1200 + FastQC (pre-Trimming): + total_sequences: 100 + avg_sequence_length: 110 + percent_gc: 120 + fastp: + after_filtering_gc_content: 200 + Adapter Removal: + percent_aligned: 300 + FastQC (post-Trimming): + total_sequences: 400 + avg_sequence_length: 410 + percent_gc: 420 + Bowtie 2 / HiSAT2: + overall_alignment_rate: 450 + MALT: + Num. of queries: 430 + Total reads: 440 + Mappability: 450 + Assig. 
Taxonomy: 460 + Taxonomic assignment success: 470 + Kraken: + "% Unclassified": 480 + Samtools Flagstat (pre-samtools filter): + flagstat_total: 551 + mapped_passed: 552 + Samtools Flagstat (post-samtools filter): + flagstat_total: 600 + mapped_passed: 620 + endorSpy: + endogenous_dna: 610 + endogenous_dna_post: 640 + nuclear_contamination: + Num_SNPs: 1100 + Method1_MOM_estimate: 1110 + Method1_MOM_SE: 1120 + Method1_ML_estimate: 1130 + Method1_ML_SE: 1140 + Method2_MOM_estimate: 1150 + Method2_MOM_SE: 1160 + Method2_ML_estimate: 1170 + Method2_ML_SE: 1180 + snp_coverage: + Covered_Snps: 1050 + Total_Snps: 1060 + DeDup: + mapped_after_dedup: 620 + clusterfactor: 630 + Picard: + PERCENT_DUPLICATION: 650 + DamageProfiler: + 5 Prime1: 700 + 5 Prime2: 710 + 3 Prime1: 720 + 3 Prime2: 730 + mean_readlength: 740 + median: 750 + mtnucratio: + mtreads: 760 + mt_cov_avg: 770 + mt_nuc_ratio: 780 + QualiMap: + mapped_reads: 800 + mean_coverage: 805 + median_coverage: 810 + 1_x_pc: 820 + 2_x_pc: 830 + 3_x_pc: 840 + 4_x_pc: 850 + 5_x_pc: 860 + avg_gc: 870 + sexdeterrmine: + RateX: 1000 + RateY: 1010 + MultiVCFAnalyzer: + Heterozygous SNP alleles (percent): 1200 read_count_multiplier: 1 -read_count_prefix: '' -read_count_desc: '' -ancient_read_count_prefix: '' -ancient_read_count_desc: '' +read_count_prefix: "" +read_count_desc: "" +ancient_read_count_prefix: "" +ancient_read_count_desc: "" ancient_read_count_multiplier: 1 -decimalPoint_format: '.' -thousandsSep_format: ',' +decimalPoint_format: "." +thousandsSep_format: "," report_section_order: - software_versions: - order: -1000 - nf-core-eager-summary: - order: -1001 + software_versions: + order: -1000 + nf-core-eager-summary: + order: -1001 export_plots: true table_columns_name: - FastQC (pre-Trimming): - total_sequences: "Nr. 
Input Reads" - avg_sequence_length: "Length Input Reads" - percent_gc: "% GC Input Reads" - percent_duplicates: "% Dups Input Reads" - percent_fails: "% Failed Input Reads" - FastQC (post-Trimming): - total_sequences: "Nr. Processed Reads" - avg_sequence_length: "Length Processed Reads" - percent_gc: "% GC Processed Reads" - percent_duplicates: "% Dups Processed Reads" - percent_fails: "%Failed Processed Reads" - Samtools Flagstat (pre-samtools filter): - flagstat_total: "Nr. Reads Into Mapping" - mapped_passed: "Nr. Mapped Reads" - Samtools Flagstat (post-samtools filter): - flagstat_total: "Nr. Mapped Reads Post-Filter" - mapped_passed: "Nr. Mapped Reads Passed Post-Filter" - Endogenous DNA Post (%): - endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)" - Picard: - PERCENT_DUPLICATION: "% Dup. Mapped Reads" - DamageProfiler: - mean_readlength: "Mean Length Mapped Reads" - median_readlength: "Median Length Mapped Reads" - QualiMap: - mapped_reads: "Nr. Dedup. Mapped Reads" - total_reads: "Nr. Dedup. Total Reads" - avg_gc: "% GC Dedup. Mapped Reads" - Bcftools Stats: - number_of_records: "Nr. Overall Variants" - number_of_SNPs: "Nr. SNPs" - number_of_indels: "Nr. InDels" - MALT: - Mappability: "% Metagenomic Mappability" - SexDetErrmine: - RateErrX: "SexDet Err X Chr" - RateErrY: "SexDet Err Y Chr" - RateX: "SexDet Rate X Chr" - RateY: "SexDet Rate Y Chr" \ No newline at end of file + FastQC (pre-Trimming): + total_sequences: "Nr. Input Reads" + avg_sequence_length: "Length Input Reads" + percent_gc: "% GC Input Reads" + percent_duplicates: "% Dups Input Reads" + percent_fails: "% Failed Input Reads" + FastQC (post-Trimming): + total_sequences: "Nr. Processed Reads" + avg_sequence_length: "Length Processed Reads" + percent_gc: "% GC Processed Reads" + percent_duplicates: "% Dups Processed Reads" + percent_fails: "%Failed Processed Reads" + Samtools Flagstat (pre-samtools filter): + flagstat_total: "Nr. Reads Into Mapping" + mapped_passed: "Nr. 
Mapped Reads" + Samtools Flagstat (post-samtools filter): + flagstat_total: "Nr. Mapped Reads Post-Filter" + mapped_passed: "Nr. Mapped Reads Passed Post-Filter" + Endogenous DNA Post (%): + endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)" + Picard: + PERCENT_DUPLICATION: "% Dup. Mapped Reads" + DamageProfiler: + mean_readlength: "Mean Length Mapped Reads" + median_readlength: "Median Length Mapped Reads" + QualiMap: + mapped_reads: "Nr. Dedup. Mapped Reads" + total_reads: "Nr. Dedup. Total Reads" + avg_gc: "% GC Dedup. Mapped Reads" + Bcftools Stats: + number_of_records: "Nr. Overall Variants" + number_of_SNPs: "Nr. SNPs" + number_of_indels: "Nr. InDels" + MALT: + Mappability: "% Metagenomic Mappability" + SexDetErrmine: + RateErrX: "SexDet Err X Chr" + RateErrY: "SexDet Err Y Chr" + RateX: "SexDet Rate X Chr" + RateY: "SexDet Rate Y Chr" diff --git a/docs/README.md b/docs/README.md index 64eeacd97..e7a82cd74 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,22 +2,21 @@ The nf-core/eager documentation is split into the following pages: -* [Usage](usage.md) - * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - * Also includes: FAQ, Troubleshooting and Tutorials -* [Output](output.md) - * An overview of the different results produced by the pipeline and how to interpret them. +- [Usage](usage.md) + - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. + - Also includes: FAQ, Troubleshooting and Tutorials +- [Output](output.md) + - An overview of the different results produced by the pipeline and how to interpret them. You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re). 
Additional pages are: -* [Installation](https://nf-co.re/usage/installation) -* Pipeline configuration - * [Local installation](https://nf-co.re/usage/local_installation) - * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) -* [Contribution Guidelines](../.github/CONTRIBUTING.md) - * Basic contribution & behaviour guidelines - * Checklists and guidelines for people who would like to contribute code - \ No newline at end of file +- [Installation](https://nf-co.re/usage/installation) +- Pipeline configuration + - [Local installation](https://nf-co.re/usage/local_installation) + - [Adding your own system config](https://nf-co.re/usage/adding_own_config) + - [Reference genomes](https://nf-co.re/usage/reference_genomes) +- [Contribution Guidelines](../.github/CONTRIBUTING.md) + - Basic contribution & behaviour guidelines + - Checklists and guidelines for people who would like to contribute code diff --git a/docs/output.md b/docs/output.md index 8acdbe1d4..a9fbda3d9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -19,25 +19,25 @@ results/ work/ ``` -* The parent directory `` is the parent directory of the run, either the directory the pipeline was run from or as specified by the `--outdir` flag. The default name of the output directory (unless otherwise specified) will be `./results/`. +- The parent directory `` is the parent directory of the run, either the directory the pipeline was run from or as specified by the `--outdir` flag. The default name of the output directory (unless otherwise specified) will be `./results/`. ### Primary Output Directories These directories are the ones you will use on a day-to-day basis and are those which you should familiarise yourself with. -* The `MultiQC` directory is the most important directory and contains the main summary report of the run in HTML format, which can be viewed in a web-browser of your choice. 
The sub-directory contains the MultiQC collected data used to build the HTML report. The Report allows you to get an overview of the sequencing and mapping quality as well as aDNA metrics (see the [MultiQC Report](#multiqc-report) section for more detail). -* A `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular part of the pipeline. +- The `MultiQC` directory is the most important directory and contains the main summary report of the run in HTML format, which can be viewed in a web-browser of your choice. The sub-directory contains the MultiQC collected data used to build the HTML report. The Report allows you to get an overview of the sequencing and mapping quality as well as aDNA metrics (see the [MultiQC Report](#multiqc-report) section for more detail). +- A `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular part of the pipeline. ### Secondary Output Directories These are less important directories which are used less often, normally in the context of bug-reporting. -* `pipeline_info/`: [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. 
This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. - * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. - * Documentation for interpretation of results in HTML format: `results_description.html`. -* `reference_genome/` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxiliary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not important. -* The `work/` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. +- `pipeline_info/`: [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. + - Documentation for interpretation of results in HTML format: `results_description.html`. 
+- `reference_genome/` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxiliary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not important. +- The `work/` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. > :warning: Note that `work/` will be created wherever you are running the `nextflow run` command from, unless you specify the location with `-w`, i.e. it will not by default be in `outdir`!. @@ -61,41 +61,41 @@ Each column name is supplied by the module, so you may see similar column names. The possible columns displayed by default are as follows (note you may see additional columns depending on what other modules you activate): -* **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. -* **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. -* **Length Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. -* **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. 
-* **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC %GC column. -* **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. -* **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. -* **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. -* **Length Processed Reads** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules -* **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below). -* **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. -* **% Unclassified** This is from Kraken. It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias. -* **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping. -* **Nr. Mapped Reads** This is from Samtools. 
This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. -* **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. -* **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) -* **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. -* **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. -* **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. 
It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). -* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. -* **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -* **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -* **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. -* **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position. -* **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. -* **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. -* **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. 
-* **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. -* **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. -* **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller. -* **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. -* **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. -* **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button. -* **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button. -* **% Hets** This is from MultiVCFAnalyzer. This reports the number of SNPs on an assumed haploid organism that have two possible alleles. A high percentage may indicate cross-mapping from a related species. +- **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. +- **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. +- **Length Input Reads** This is from Pre-AdapterRemoval FastQC. 
This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. +- **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. +- **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC %GC column. +- **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. +- **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. +- **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. +- **Length Processed Reads** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules +- **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below). +- **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. +- **% Unclassified** This is from Kraken. 
It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias. +- **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping. +- **Nr. Mapped Reads** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. +- **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapping (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ includes contamination and will not entirely be the true 'endogenous' content. +- **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) +- **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapping (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. +- **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. 
A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. +- **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). +- **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. +- **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +- **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +- **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. +- **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position. +- **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. +- **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. 
This should normally be close to the GC content of your reference genome. +- **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. +- **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. +- **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. +- **#SNPs Covered** This is from eigenstrat_snp_coverage. The number of called SNPs after genotyping with pileupcaller. +- **#SNPs Total** This is from eigenstrat_snp_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. +- **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. +- **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button. +- **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button. +- **% Hets** This is from MultiVCFAnalyzer. This reports the number of SNPs on an assumed haploid organism that have two possible alleles. A high percentage may indicate cross-mapping from a related species. For other non-default columns (activated under 'Configure Columns'), hover over the column name for further descriptions. 
@@ -107,13 +107,13 @@ For other non-default columns (activated under 'Configure Columns'), hover over You will receive output for each supplied FASTQ file. -When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will *normally* be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). +When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will _normally_ be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. 
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. +> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. > :warning: If you turned on `--post_ar_fastq_trimming` your 'post-Trimming' report the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming. #### Sequence Counts @@ -138,10 +138,10 @@ You will often see that the first 5 or so bases have slightly lower quality than Things to watch out for: -* all positions having Phred scores less than 27 -* a sharp drop-off of quality early in the read -* for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. - +- all positions having Phred scores less than 27 +- a sharp drop-off of quality early in the read +- for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. + #### Per Sequence Quality Scores This is a further summary of the previous plot. This is a histogram of the _overall_ read quality (compared to per-base, above). The x axis is the mean read-quality score (summarising all the bases of the read in a single value), and the y-axis is the number of reads with this Phred score. 
You should see a peak with the majority of your reads between 27-35. @@ -152,9 +152,9 @@ This is a further summary of the previous plot. This is a histogram of the _over Things to watch out for: -* bi-modal peaks which suggests artefacts in some of the sequencing cycles -* all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). - +- bi-modal peaks which suggests artefacts in some of the sequencing cycles +- all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). + #### Per Base Sequencing Content This is a heatmap which shows the average percentage of C, G, T, and A nucleotides across ~4bp bins across all reads. @@ -167,7 +167,7 @@ You expect to see whole heatmap to be a relatively equal block of colour (normal Things to watch out for: -* If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). +- If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). > If you see Poly-G tails, we recommend to turn on FastP poly-G trimming with EAGER. See the 'running' documentation page for details. 
@@ -181,7 +181,7 @@ This line graph shows the number percentage reads (y-axis) with an average perce Things to watch out for: -* If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running nf-core/eager using the poly-G trimming option from `fastp` See the 'running' documentation page for details. +- If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running nf-core/eager using the poly-G trimming option from `fastp` See the 'running' documentation page for details. #### Per Base N Content @@ -251,13 +251,13 @@ The pipeline will generate the respective output for each supplied FASTQ file. This line plot shows the average GC content (Y axis) across each nucleotide of the reads (X-axis). There are two buttons per read (i.e. 2 for single-end, and 4 for paired-end) representing before and after the poly-G tail trimming. -Before filtering, if you have poly-G tails, you should see the lines going up at the end of the right-hand side of the plot. +Before filtering, if you have poly-G tails, you should see the lines going up at the end of the right-hand side of the plot. After filtering, you should see that the average GC content along the reads is now reduced to around the general trend of the entire read. Things to look out for: -* If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. 
If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. +- If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. ### AdapterRemoval @@ -265,10 +265,10 @@ Things to look out for: AdapterRemoval a tool that does the post-sequencing clean up of your sequencing reads. It performs the following functions -* 'Merges' (or 'collapses') forward and reverse reads of Paired End data -* Removes remaining library indexing adapters -* Trims low quality base tails from ends of reads -* Removes too-short reads +- 'Merges' (or 'collapses') forward and reverse reads of Paired End data +- Removes remaining library indexing adapters +- Trims low quality base tails from ends of reads +- Removes too-short reads In more detail merging is where the same read from the forward and reverse files of a single library (based on the flowcell coordinates), are compared to find a stretch of sequence that are the same. If this overlap reaches certain quality thresholds, the two reads are 'collapsed' into a single read, with the base quality scores are updated accordingly accounting for the increase quality call precision. @@ -284,14 +284,14 @@ You will receive output for each FASTQ file supplied for single end data, or for These stacked bars plots are unfortunately a little confusing, when displayed in MultiQC. However are relatively straight-forward once you understand each category. They can be displayed as counts of reads per AdapterRemoval read-category, or as percentages of the same values. 
Each forward(/reverse) file combination are displayed once. -The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar *includes* the other categories displayed (see below) in the calculation. +The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar _includes_ the other categories displayed (see below) in the calculation. Other Categories: -* If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. -* If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. -* If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did required the removal of low quality bases at the end of reads. -* **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. However the other member of the pair is still retained in the output file as it still reached other quality criteria. +- If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. +- If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. +- If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did require the removal of low quality bases at the end of reads. +- **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. 
However the other member of the pair is still retained in the output file as it still reached other quality criteria.

@@ -305,11 +305,11 @@ If you see high numbers of discarded or truncated reads, you should check your F The length distribution plots show the number of reads at each read-length. You can change the plot to display different categories. -* All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. -* **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing -* **Singleton** represent those reads that had a one member of a pair discarded -* **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. -* **Discarded** here represents the number of reads that did not each the read length filter. You will likely see a vertical drop at what your threshold was set to. +- All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. +- **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing +- **Singleton** represent those reads that had one member of a pair discarded +- **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. +- **Discarded** here represents the number of reads that did not reach the read length filter. You will likely see a vertical drop at what your threshold was set to.

@@ -323,7 +323,7 @@ With paired-end ancient DNA sequencing runs You expect to see a slight increase This module provides information on mapping when running the Bowtie2 aligner. Bowtie2, like bwa, takes raw FASTQ reads and finds the most likely place on the reference genome it derived from. While this module is somewhat redundant with the [Samtools](#samtools) (which reports mapping statistics for bwa) and the endorSp.y endogenous DNA value in the general statistics table, it does provide some details that could be useful in certain contexts. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Single/Paired-end alignments @@ -343,7 +343,7 @@ The main additional useful information compared to [Samtools](#samtools) is that MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. 
#### Metagenomic Mappability @@ -355,7 +355,7 @@ Due to low 'endogenous' content of aDNA, and the high biodiversity of modern or

- This can also be influenced by the type of database you supplied — many databases have an over-abundance of taxa of clinical or economic interest, so when you have a large amount of uncharacterised environmental taxa, this may also result in low mappability. +This can also be influenced by the type of database you supplied — many databases have an over-abundance of taxa of clinical or economic interest, so when you have a large amount of uncharacterised environmental taxa, this may also result in low mappability. #### Taxonomic assignment success @@ -378,7 +378,7 @@ Kraken is another metagenomic classifier, but takes a different approach to alig It is useful when you do not have large computing power or you want very rapid but rough approximation of the metagenomic profile of your sample. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. #### Top Taxa @@ -396,7 +396,7 @@ However for screening for specific metagenomic profiles, such as ancient microbi This module provides numbers in raw counts of the mapping of your DNA reads to your reference genome. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. 
This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Flagstat Plot @@ -416,7 +416,7 @@ The remaining rows will be 0 when running `bwa aln` as these characteristics of ### DeDup -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Background @@ -426,15 +426,15 @@ DeDup is a duplicate removal tool which searches for PCR duplicates and removes This stacked bar plot shows as a whole the total number of reads in the BAM file going into DeDup. The different sections of a given bar represents the following: -* **Not Removed** — the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). -* **Reverse Removed** — the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). -* **Forward Removed** — the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). -* **Merged Removed** — the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). - +- **Not Removed** — the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). 
+- **Reverse Removed** — the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). +- **Forward Removed** — the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). +- **Merged Removed** — the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). + Exceptions to the above: -* If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. -* If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections. +- If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. +- If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections.

@@ -442,8 +442,8 @@ Exceptions to the above: Things to look out for: -* The smaller the number of the duplicates removed the better. If you have a small number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. -* If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. +- The smaller the number of the duplicates removed the better. If you have a small number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. +- If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. ### Picard @@ -463,8 +463,8 @@ The amount of unmapped reads will depend on whether you have filtered out unmapp Things to look out for: -* The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. -* If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, a badly preserved sample with a very low yield, or a lot of left-over adapters that were able to map to your genome. +- The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. +- If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, a badly preserved sample with a very low yield, or a lot of left-over adapters that were able to map to your genome. 
### Preseq @@ -476,7 +476,7 @@ There are two algorithms from the tools we use: `c_curve` and `lc_extrap`. The f Due to endogenous DNA being so low when doing initial screening, the maths behind `lc_extrap` often fails as there is not enough data. Therefore nf-core/eager sticks with `c_curve` which gives a similar approximation of the library complexity, but is more robust to smaller datasets. -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Complexity Curve @@ -490,9 +490,9 @@ The dashed line represents a 'perfect' library containing only unique molecules Plateauing can be caused by a number of reasons: -* You have simply sequenced your library to exhaustion -* You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio -* You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters) +- You have simply sequenced your library to exhaustion +- You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio +- You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. 
adapters) ### DamageProfiler @@ -502,24 +502,24 @@ DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics f Therefore, three main characteristics of ancient DNA are: -* Short DNA fragments -* Elevated G and As (purines) just before strand breaks -* Increased C and Ts at ends of fragments +- Short DNA fragments +- Elevated G and As (purines) just before strand breaks +- Increased C and Ts at ends of fragments + +You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. - #### Misincorporation Plots The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. When looking at the misincorporation plots, keep the following in mind: -* As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. -* If your library has been **partially-UDG treated**, only the first one or two bases will display the misincorporation frequency. -* If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. 
-* If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. -* We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A. -* The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable. +- As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. +- If your library has been **partially-UDG treated**, only the first one or two bases will display the misincorporation frequency. +- If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. +- If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. +- We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A. +- The curve will not be smooth when you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable.

@@ -533,9 +533,9 @@ The MultiQC DamageProfiler module length distribution plots show the frequency o When looking at the length distribution plots, keep in mind the following: -* Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. -* You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA -* You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination. +- Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. +- You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA +- You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however these reads are normally from modern contamination. ### QualiMap @@ -547,7 +547,7 @@ Qualimap is a tool which provides statistics on the quality of the mapping of yo Note that many of the statistics from this module are displayed in the General Stats table (see above), as they represent single values that are not plottable. -You will receive output for each *sample*. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). +You will receive output for each _sample_. This means you will receive statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). :warning: If your library has no reads mapping to the reference, this will result in an empty BAM file. Qualimap will therefore not produce any output even if a BAM exists!
@@ -563,9 +563,9 @@ The greater the number of bases covered at as high as possible fold coverage, th Things to watch out for: -* You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. -* If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. - +- You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. +- If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. + #### Cumulative Genome Coverage This plot shows how much of the genome in percentage (X axis) is covered by a given fold depth coverage (Y axis). @@ -586,9 +586,9 @@ This plot shows the distribution of the frequency of reads at different GC conte Things to watch out for: -* This plot should normally show a normal distribution around the average GC content of your reference genome. -* Bimodal peaks may represent lab-based artefacts that should be further investigated. -* Skews of the peak to a higher GC content that the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The nf-core/eager trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. 
+- This plot should normally show a normal distribution around the average GC content of your reference genome. +- Bimodal peaks may represent lab-based artefacts that should be further investigated. +- Skews of the peak to a higher GC content than the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The nf-core/eager trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. ### Sex.DetERRmine @@ -658,7 +658,7 @@ This table shows the contents of the `snpStatistics.tsv` file produced by MultiV #### Call statistics barplot -You can get different variants of the call statistics bar plot, depending on how you configured the MultiVCFAnalyzer options. +You can get different variants of the call statistics bar plot, depending on how you configured the MultiVCFAnalyzer options. If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to two different values (left panel A in the figure below), this allows you to assess the number of multi-allelic positions that were called in your genome. Typically MultiVCFAnalyzer is used for analysing smallish haploid genomes (such as mitochondrial or bacterial genomes), therefore a position with multiple possible 'alleles' suggests some form of cross-mapping from other taxa or presence of multiple strains. If this is the case, you will need to be careful with downstream analysis of the consensus sequence (e.g. for phylogenetic tree analysis) as you may accidentally pick up SNPs from other taxa/strains — particularly when dealing with low coverage data. Therefore if you have a high level of 'het' values (see image), you should carefully check your alignments manually to see how clean your genomes are, or whether you can do some form of strain separation (e.g. by majority/minority calling).
@@ -670,40 +670,40 @@ If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to the s ## Output Files -This section gives a brief summary of where to look for what files for downstream analysis. This covers *all* modules. +This section gives a brief summary of where to look for what files for downstream analysis. This covers _all_ modules. Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. -* `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. - * When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_genome` will be found here, containing the masked reference. -* `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. -* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. -* `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. -* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). 
You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). -* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. -* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. -* `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. -* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. 
-* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. -* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. -* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. -* `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). -* `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. -* `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). 
If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. -* `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. -* `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. -* `nuclear_contamination/`: this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. -* `bedtools/`: this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. 
BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). -* `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. -* `metagenomic_classification/`: this contains the output for a given metagenomic classifier. - * Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. 
You will find two kraken reports formats available: - * the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) - * the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. - * finally, the `*.kraken.out` file are the direct output of Kraken2 - * ⚠️ If your sample has no hits, no kraken output files will be created for that sample! -* `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) -* `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. - `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the sample sample, **and** same UDG-treatment type will be in these BAM files. -* `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). 
This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) -* `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied. +- `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. + - When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_genome` will be found here, containing the masked reference. +- `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. +- `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. +- `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. +- `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). 
You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). +- `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. +- `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. +- `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. +- `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. 
+- `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. +- `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. +- `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. +- `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). +- `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. +- `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). 
If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. +- `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. +- `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. +- `nuclear_contamination/`: this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. +- `bedtools/`: this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. 
BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). +- `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. +- `metagenomic_classification/`: this contains the output for a given metagenomic classifier. + - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. + - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). _Kmer duplication is defined as: number of kmers / number of unique kmers_. 
You will find two kraken reports formats available: + - the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) + - the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. + - finally, the `*.kraken.out` file are the direct output of Kraken2 + - ⚠️ If your sample has no hits, no kraken output files will be created for that sample! +- `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) +- `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. + `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the sample sample, **and** same UDG-treatment type will be in these BAM files. +- `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). 
This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) +- `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These include things such as the number of positions, number of transitions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied. diff --git a/docs/usage.md b/docs/usage.md index 454b10a93..683dacfba 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -36,8 +36,7 @@ results # Finished results (configurable, see below) # Other Nextflow hidden files, eg. history of pipeline runs and old logs. ``` -To see the the nf-core/eager pipeline help message run: `nextflow run -nf-core/eager --help` +To see the nf-core/eager pipeline help message run: `nextflow run nf-core/eager --help` If you want to configure your pipeline interactively using a graphical user interface, please visit [nf-co.re @@ -92,30 +91,30 @@ They are loaded in sequence, so later profiles can overwrite earlier profiles. If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.
-* `docker` - * A generic configuration profile to be used with [Docker](https://docker.com/) - * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -* `singularity` - * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) - * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -* `podman` - * A generic configuration profile to be used with [Podman](https://podman.io/) - * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -* `shifter` - * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -* `charliecloud` - * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) - * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -* `conda` - * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. 
- * A generic configuration profile to be used with [Conda](https://conda.io/docs/) - * Pulls most software from [Bioconda](https://bioconda.github.io/) -* `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters - -> *Important*: If running nf-core/eager on a cluster - ask your system +- `docker` + - A generic configuration profile to be used with [Docker](https://docker.com/) + - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +- `singularity` + - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +- `podman` + - A generic configuration profile to be used with [Podman](https://podman.io/) + - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +- `shifter` + - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +- `charliecloud` + - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +- `conda` + - Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/) + - Pulls most software from [Bioconda](https://bioconda.github.io/) +- `test` + - A profile with a complete configuration for automated testing + - Includes links to test data so needs no other parameters + +> _Important_: If running nf-core/eager on a cluster - ask your system > administrator what profile to use. 
**Institution Specific Profiles** These are profiles specific to certain **HPC @@ -124,17 +123,17 @@ clusters**, and are centrally maintained at regular users of nf-core/eager, if you don't see your own institution here check the [nf-core/configs](https://github.com/nf-core/configs) repository. -* `uzh` - * A profile for the University of Zurich Research Cloud - * Loads Singularity and defines appropriate resources for running the +- `uzh` + - A profile for the University of Zurich Research Cloud + - Loads Singularity and defines appropriate resources for running the pipeline. -* `binac` - * A profile for the BinAC cluster at the University of Tuebingen 0 Loads +- `binac` + - A profile for the BinAC cluster at the University of Tuebingen 0 Loads Singularity and defines appropriate resources for running the pipeline -* `shh` - * A profile for the S/CDAG cluster at the Department of Archaeogenetics of +- `shh` + - A profile for the S/CDAG cluster at the Department of Archaeogenetics of the Max Planck Institute for the Science of Human History - * Loads Singularity and defines appropriate resources for running the pipeline + - Loads Singularity and defines appropriate resources for running the pipeline **Pipeline Specific Institution Profiles** There are also pipeline-specific institution profiles. I.e., we can also offer a profile which sets special @@ -145,10 +144,10 @@ pipelines. 
This can be seen at We currently offer a nf-core/eager specific profile for -* `shh` - * A profiler for the S/CDAG cluster at the Department of Archaeogenetics of +- `shh` + - A profiler for the S/CDAG cluster at the Department of Archaeogenetics of the Max Planck Institute for the Science of Human History - * In addition to the nf-core wide profile, this also sets the MALT resources + - In addition to the nf-core wide profile, this also sets the MALT resources to match our commonly used databases Further institutions can be added at @@ -259,7 +258,7 @@ There are two possible ways of supplying input sequencing data to nf-core/eager. This method is where you specify with `--input`, the path locations of FASTQ (optionally gzipped) or BAM file(s). This option is mutually exclusive to the [TSV input method](#tsv-input-method), which is used for more complex input configurations such as lane and library merging. -When using the direct method of `--input` you can specify one or multiple samples in one or more directories files. File names **must be unique**, even if in different directories. +When using the direct method of `--input` you can specify one or multiple samples in one or more directories files. File names **must be unique**, even if in different directories. By default, the pipeline _assumes_ you have paired-end data. If you want to run single-end data you must specify [`--single_end`]('#single_end') @@ -285,7 +284,7 @@ If you have multiple files in different directories, you can use additional wild 4. When using the pipeline with **paired end data**, the path must use `{1,2}` notation to specify read pairs. 5. Files names must be unique, having files with the same name, but in different directories is _not_ sufficient - * This can happen when a library has been sequenced across two sequencers on the same lane. Either rename the file, try a symlink with a unique name, or merge the two FASTQ files prior input. 
+ - This can happen when a library has been sequenced across two sequencers on the same lane. Either rename the file, try a symlink with a unique name, or merge the two FASTQ files prior input. 6. Due to limitations of downstream tools (e.g. FastQC), sample IDs may be truncated after the first `.` in the name, Ensure file names are unique prior to this! 7. For input BAM files you should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. @@ -303,8 +302,8 @@ The use of the TSV `--input` method is recommended when performing more complex This TSV should look like the following: -| Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | -|-------------|------------|------|------------------|--------|----------|--------------|---------------|----|----|-----| +| Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | +| ----------- | ---------- | ---- | ---------------- | ------- | -------- | ------------ | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | | JK2782 | JK2782 | 1 | 4 | PE | Mammoth | double | full | 
[https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz) | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz) | NA | | JK2802 | JK2802 | 2 | 2 | SE | Mammoth | double | full | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz) | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz) | NA | @@ -317,22 +316,22 @@ When using TSV_input, nf-core/eager will merge FASTQ files of libraries with the Column descriptions are as follows: -* **Sample_Name:** A text string containing the name of a given sample of which there can be multiple libraries. All libraries with the same sample name and same SeqType will be merged after deduplication. -* **Library_ID:** A text string containing a given library, which there can be multiple sequencing lanes (with the same SeqType). -* **Lane:** A number indicating which lane the library was sequenced on. Files from the libraries sequenced on different lanes (and different SeqType) will be concatenated after read clipping and merging. -* **Colour Chemistry** A number indicating whether the Illumina sequencer the library was sequenced on was a 2 (e.g. 
Next/NovaSeq) or 4 (Hi/MiSeq) colour chemistry machine. This informs whether poly-G trimming (if turned on) should be performed. -* **SeqType:** A text string of either 'PE' or 'SE', specifying paired end (with both an R1 [or forward] and R2 [or reverse]) and single end data (only R1 [forward], or BAM). This will affect lane merging if different per library. -* **Organism:** A text string of the organism name of the sample or 'NA'. This currently has no functionality and can be set to 'NA', but will affect lane/library merging if different per library -* **Strandedness:** A text string indicating whether the library type is'single' or 'double'. This will affect lane/library merging if different per library. -* **UDG_Treatment:** A text string indicating whether the library was generated with UDG treatment - either 'full', 'half' or 'none'. Will affect lane/library merging if different per library. -* **R1:** A text string of a file path pointing to a forward or R1 FASTQ file. This can be used with the R2 column. File names **must be unique**, even if they are in different directories. -* **R2:** A text string of a file path pointing to a reverse or R2 FASTQ file, or 'NA' when single end data. This can be used with the R1 column. File names **must be unique**, even if they are in different directories. -* **BAM:** A text string of a file path pointing to a BAM file, or 'NA'. Cannot be specified at the same time as R1 or R2, both of which should be set to 'NA' +- **Sample_Name:** A text string containing the name of a given sample of which there can be multiple libraries. All libraries with the same sample name and same SeqType will be merged after deduplication. +- **Library_ID:** A text string containing a given library, which there can be multiple sequencing lanes (with the same SeqType). +- **Lane:** A number indicating which lane the library was sequenced on. 
Files from the libraries sequenced on different lanes (and different SeqType) will be concatenated after read clipping and merging. +- **Colour Chemistry** A number indicating whether the Illumina sequencer the library was sequenced on was a 2 (e.g. Next/NovaSeq) or 4 (Hi/MiSeq) colour chemistry machine. This informs whether poly-G trimming (if turned on) should be performed. +- **SeqType:** A text string of either 'PE' or 'SE', specifying paired end (with both an R1 [or forward] and R2 [or reverse]) and single end data (only R1 [forward], or BAM). This will affect lane merging if different per library. +- **Organism:** A text string of the organism name of the sample or 'NA'. This currently has no functionality and can be set to 'NA', but will affect lane/library merging if different per library +- **Strandedness:** A text string indicating whether the library type is 'single' or 'double'. This will affect lane/library merging if different per library. +- **UDG_Treatment:** A text string indicating whether the library was generated with UDG treatment - either 'full', 'half' or 'none'. Will affect lane/library merging if different per library. +- **R1:** A text string of a file path pointing to a forward or R1 FASTQ file. This can be used with the R2 column. File names **must be unique**, even if they are in different directories. +- **R2:** A text string of a file path pointing to a reverse or R2 FASTQ file, or 'NA' when single end data. This can be used with the R1 column. File names **must be unique**, even if they are in different directories. +- **BAM:** A text string of a file path pointing to a BAM file, or 'NA'. 
Cannot be specified at the same time as R1 or R2, both of which should be set to 'NA' For example, the following TSV table: | Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | -|-------------|------------|------|------------------|---------|----------|--------------|---------------|----------------------------------------------------------------|----------------------------------------------------------------|-----| +| ----------- | ---------- | ---- | ---------------- | ------- | -------- | ------------ | ------------- | -------------------------------------------------------------- | -------------------------------------------------------------- | --- | | JK2782 | JK2782 | 7 | 4 | PE | Mammoth | double | full | data/JK2782_TGGCCGATCAACGA_L007_R1_001.fastq.gz.tengrand.fq.gz | data/JK2782_TGGCCGATCAACGA_L007_R2_001.fastq.gz.tengrand.fq.gz | NA | | JK2782 | JK2782 | 8 | 4 | PE | Mammoth | double | full | data/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz | data/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz | NA | | JK2802 | JK2802 | 7 | 4 | PE | Mammoth | double | full | data/JK2802_AGAATAACCTACCA_L007_R1_001.fastq.gz.tengrand.fq.gz | data/JK2802_AGAATAACCTACCA_L007_R2_001.fastq.gz.tengrand.fq.gz | NA | @@ -340,35 +339,35 @@ For example, the following TSV table: will have the following effects: -* After AdapterRemoval, and prior to mapping, FASTQ files from lane 7 and lane 8 _with the same `SeqType`_ (and all other _metadata_ columns) will be concatenated together for each **Library**. -* After mapping, and prior BAM filtering, BAM files with different `SeqType` (but with all other metadata columns the same) will be merged together for each **Library**. -* After duplicate removal, BAM files with different `Library_ID`s but with the same `Sample_Name` and the same `UDG_Treatment` will be merged together. -* If BAM trimming is turned on, all post-trimming BAMs (i.e. 
non-UDG and half-UDG ) will be merged with UDG-treated (untreated) BAMs, if they have the same `Sample_Name`. +- After AdapterRemoval, and prior to mapping, FASTQ files from lane 7 and lane 8 _with the same `SeqType`_ (and all other _metadata_ columns) will be concatenated together for each **Library**. +- After mapping, and prior BAM filtering, BAM files with different `SeqType` (but with all other metadata columns the same) will be merged together for each **Library**. +- After duplicate removal, BAM files with different `Library_ID`s but with the same `Sample_Name` and the same `UDG_Treatment` will be merged together. +- If BAM trimming is turned on, all post-trimming BAMs (i.e. non-UDG and half-UDG ) will be merged with UDG-treated (untreated) BAMs, if they have the same `Sample_Name`. Note the following important points and limitations for setting up: -* The TSV must use actual tabs (not spaces) between cells. -* The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. -* *File* names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). - * At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. - * Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. 
- * If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. -* Lane IDs must be unique for each sequencing of each library. - * If you have a library sequenced e.g. on Lane 8 of two HiSeq runs, you can give a fake lane ID (e.g. 20) for one of the FASTQs, and the libraries will still be processed correctly. - * This also applies to the SeqType column, i.e. with the example above, if one run is PE and one run is SE, you need to give fake lane IDs to one of the runs as well. -* All _BAM_ files must be specified as `SE` under `SeqType`. - * You should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. -* nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration -* Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together. -* nf-core/eager is able to correctly handle libraries that are sequenced multiple times on different sequencing configurations (i.e mixtures of single- and paired-end data). These will be merged after mapping and considered 'paired-end' during downstream processes. - * **Important** we do not recommend choosing to use DeDup (i.e. `--dedupper 'dedup'`) when mixing PE and SE data, as SE data will not necessarily have the correct end position of the read, and DeDup requires both ends of the molecule to remove a duplicate read. Therefore you may end up with inflated (false-positive) coverages due to suboptimal deduplication. 
- * When you wish to run PE/SE data together, the default `-dedupper markduplicates` is therefore preferred, as it only looks at the first position. While more conservative (i.e. it'll remove more reads even if not technically duplicates, because it assumes it can't see the true ends of molecules), it is more consistent. - * An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`. - * If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager - * If you _regularly_ want to run the situation above, please leave a feature request on github. -* DamageProfiler, NuclearContamination, MTtoNucRatio and PreSeq are performed on each unique library separately after deduplication (but prior same-treated library merging). -* nf-core/eager functionality such as `--run_trim_bam` will be applied to only non-UDG (UDG_Treatment: none) or half-UDG (UDG_Treatment: half) libraries. - Qualimap is run on each sample, after merging of libraries (i.e. your values will reflect the values of all libraries combined - after being damage trimmed etc.). -* Genotyping will be typically performed on each `sample` independently, as normally all libraries will have been merged together. However, if you have a mixture of single-stranded and double-stranded libraries, you will normally need to genotype separately. In this case you **must** give each the SS and DS libraries _distinct_ `Sample_IDs`; otherwise you will receive a `file collision` error in steps such as `sexdeterrmine`, and then you will need to merge these yourself. We will consider changing this behaviour in the future if there is enough interest. +- The TSV must use actual tabs (not spaces) between cells. 
+- The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. +- _File_ names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). + - At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. + - Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. + - If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. +- Lane IDs must be unique for each sequencing of each library. + - If you have a library sequenced e.g. on Lane 8 of two HiSeq runs, you can give a fake lane ID (e.g. 20) for one of the FASTQs, and the libraries will still be processed correctly. + - This also applies to the SeqType column, i.e. with the example above, if one run is PE and one run is SE, you need to give fake lane IDs to one of the runs as well. +- All _BAM_ files must be specified as `SE` under `SeqType`. + - You should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. 
+- nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration +- Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together. +- nf-core/eager is able to correctly handle libraries that are sequenced multiple times on different sequencing configurations (i.e mixtures of single- and paired-end data). These will be merged after mapping and considered 'paired-end' during downstream processes. + - **Important** we do not recommend choosing to use DeDup (i.e. `--dedupper 'dedup'`) when mixing PE and SE data, as SE data will not necessarily have the correct end position of the read, and DeDup requires both ends of the molecule to remove a duplicate read. Therefore you may end up with inflated (false-positive) coverages due to suboptimal deduplication. + - When you wish to run PE/SE data together, the default `-dedupper markduplicates` is therefore preferred, as it only looks at the first position. While more conservative (i.e. it'll remove more reads even if not technically duplicates, because it assumes it can't see the true ends of molecules), it is more consistent. + - An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`. + - If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager + - If you _regularly_ want to run the situation above, please leave a feature request on github. +- DamageProfiler, NuclearContamination, MTtoNucRatio and PreSeq are performed on each unique library separately after deduplication (but prior same-treated library merging). +- nf-core/eager functionality such as `--run_trim_bam` will be applied to only non-UDG (UDG_Treatment: none) or half-UDG (UDG_Treatment: half) libraries. 
- Qualimap is run on each sample, after merging of libraries (i.e. your values will reflect the values of all libraries combined - after being damage trimmed etc.). +- Genotyping will be typically performed on each `sample` independently, as normally all libraries will have been merged together. However, if you have a mixture of single-stranded and double-stranded libraries, you will normally need to genotype separately. In this case you **must** give each the SS and DS libraries _distinct_ `Sample_IDs`; otherwise you will receive a `file collision` error in steps such as `sexdeterrmine`, and then you will need to merge these yourself. We will consider changing this behaviour in the future if there is enough interest. ## Clean up @@ -408,7 +407,7 @@ hard drive footprint of the run, so be sure to do this! When using TSV input, nf-core/eager will attempt to merge all `Lanes` of a `Library_ID`, or all files with the same `Library_ID` or `Sample_ID`. However, -if you have specified the same `Lane` or `Library_ID` for two sets of FASTQ +if you have specified the same `Lane` or `Library_ID` for two sets of FASTQ files you will likely receive an error such as ```bash @@ -430,7 +429,7 @@ In some cases it maybe no output log is produced by a particular tool for MultiQ Known cases include: -* Qualimap: there will be no MultiQC output if the BAM file is empty. An empty BAM file is produced when no reads map to the reference and causes Qualimap to crash - this is crash is ignored by nf-core/eager (to allow the rest of the pipeline to continue) and will therefore have no log file for that particular sample/library +- Qualimap: there will be no MultiQC output if the BAM file is empty. 
An empty BAM file is produced when no reads map to the reference and causes Qualimap to crash - this crash is ignored by nf-core/eager (to allow the rest of the pipeline to continue) and will therefore have no log file for that particular sample/library ## Tutorials @@ -547,10 +546,10 @@ If you change into this with `cd` and run `ls -la` you should see a collection of normal files, symbolic links (symlinks) and hidden files (indicated with `.` at the beginning of the file name). -* Symbolic links: are typically input files from previous processes. -* Normal files: are typically successfully completed output files from some of +- Symbolic links: are typically input files from previous processes. +- Normal files: are typically successfully completed output files from some of some of the commands in the process -* Hidden files are Nextflow generated files and include the submission commands +- Hidden files are Nextflow generated files and include the submission commands as well as log files When you have an error run, you can firstly check the contents of the output @@ -565,9 +564,7 @@ screen if you were running the command/program yourself. Again, view these with e.g. `cat` and see if you can identify the error of the program itself. Finally, you can also try running the commands _yourself_. You can firstly try -to do this by loading your given nf-core/eager environment (e.g. `singularity -shell /\/\/nf-core-eager-X-X-X.img` or `conda activate -nf-core-eager-X.X.X`), then running `bash .command.sh`. +to do this by loading your given nf-core/eager environment (e.g. `singularity shell /\/\/nf-core-eager-X-X-X.img` or `conda activate nf-core-eager-X.X.X`), then running `bash .command.sh`. If this doesn't work, this suggests either there is something wrong with the nf-core/eager environment configuration, _or_ there is still a problem with the @@ -586,7 +583,7 @@ the #eager channel). 
#### Tutorial Profiles - Background -A useful feature of Nextflow is the ability to use configuration *profiles* that +A useful feature of Nextflow is the ability to use configuration _profiles_ that can specify many default parameters and other settings on how to run your pipeline. @@ -607,9 +604,9 @@ DNA to map and cause false positive SNP calls. Within nf-core, there are two main levels of configs -* Institutional-level profiles: these normally define things like paths to +- Institutional-level profiles: these normally define things like paths to common storage, resource maximums, scheduling system -* Pipeline-level profiles: these normally define parameters specifically for a +- Pipeline-level profiles: these normally define parameters specifically for a pipeline (such as mapping parameters, turning specific modules on or off) As well as allowing more efficiency and control at cluster or Institutional @@ -617,7 +614,7 @@ levels in terms of memory usage, pipeline-level profiles can also assist in facilitating reproducible science by giving a way for researchers to 'publish' their exact pipeline parameters in way other users can automatically re-run the pipeline with the pipeline parameters used in the original publication but on -their *own* cluster. +their _own_ cluster. To illustrate this, lets say we analysed our data on a HPC called 'blue' for which an institutional profile already exists, and for our analysis we defined a @@ -667,11 +664,11 @@ This would be translated as follows. 
If your parameters looked like the following -| Parameter | Resolved Parameters | institution | cluster | my_paper | -| ----------------|------------------------|-------------|----------|----------| -| --executor | singularity | singularity | \ | \ | -| --max_memory | 256GB | 756GB | 256GB | \ | -| --bwa_aln | 0.1 | \ | 0.01 | 0.1 | +| Parameter | Resolved Parameters | institution | cluster | my_paper | +| ------------ | ------------------- | ----------- | -------- | -------- | +| --executor | singularity | singularity | \ | \ | +| --max_memory | 256GB | 756GB | 256GB | \ | +| --bwa_aln | 0.1 | \ | 0.01 | 0.1 | (where '\' is a parameter not defined in a given profile.) @@ -689,7 +686,7 @@ defined in the `cluster` profile. > institutional-level profiles. Otherwise please skip to [Writing your own profile](#tutorial-profiles---writing-your-own-profile) In actuality, a nf-core/eager run already contains many configs and profiles, -and will normally use *multiple* configs profiles in a single run. Multiple +and will normally use _multiple_ configs profiles in a single run. Multiple configuration and profiles files can be used, and each new one selected will inherit all the previous one's parameters, and the parameters in the new one will then overwrite any that have been changed from the original. @@ -727,7 +724,7 @@ nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna < In the background, any parameters in the pipeline's `nextflow.config` (containing default parameters) will be overwritten by the -`old_dna_profile.config`. In addition, the `old_dna` *profile* will overwrite +`old_dna_profile.config`. In addition, the `old_dna` _profile_ will overwrite any parameters set in the config but outside the profile definition of `old_dna_profile.config`. 
@@ -747,13 +744,13 @@ the `hpc_blue` profile, but the `mapper` parameter has been changed from The order of loading of different configuration files can be seen here: | Loading Order | Configuration File | -| -------------:|:----------------------------------------------------------------------------------------------------------------| -| 1 | `nextflow.config` in your current directory | -| 2 | (if using a script for `nextflow run`) a `nextflow.config` in the directory the script is located | -| 3 | `config` stored in your human directory under `~/.nextflow/` | -| 4 | `.config` if you specify in the `nextflow run` command with `-c` | -| 5 | general nf-core institutional configurations stored at [nf-core/configs](https://github.com/nf-core/configs) | -| 6 | pipeline-specific nf-core institutional configurations at [nf-core/configs](https://github.com/nf-core/configs) | +| ------------: | :-------------------------------------------------------------------------------------------------------------- | +| 1 | `nextflow.config` in your current directory | +| 2 | (if using a script for `nextflow run`) a `nextflow.config` in the directory the script is located | +| 3 | `config` stored in your human directory under `~/.nextflow/` | +| 4 | `.config` if you specify in the `nextflow run` command with `-c` | +| 5 | general nf-core institutional configurations stored at [nf-core/configs](https://github.com/nf-core/configs) | +| 6 | pipeline-specific nf-core institutional configurations at [nf-core/configs](https://github.com/nf-core/configs) | This loading order of these `.config` files will not normally affect the settings you use for the pipeline run itself; `-profiles` are normally more @@ -764,7 +761,7 @@ if your run does not use the parameters you expect. 
> specifying a custom `.config` file by using `-C` (capital C) instead of `-c` > (which inherits previously specify parameters) -Another thing that is important to note is that if a specific *profile* is +Another thing that is important to note is that if a specific _profile_ is specified in `nextflow run`, this replaces any 'global' parameter that is specified within the config file (but outside a profile) itself - **regardless** of profile order (see above). @@ -782,7 +779,7 @@ params { // Specific nf-core/configs params config_profile_contact = 'James Fellows Yates (@jfy133)' config_profile_description = 'nf-core/eager SHH profile provided by nf-core/configs' - + // default BWA bwaalnn = 0.04 bwaalnl = 32 @@ -804,8 +801,7 @@ profiles { ``` If you run with `nextflow run -profile shh` to specify to use an -institutional-level nf-core config, the parameters will be read as `--bwaalnn -0.04` and `--bwaalnl 32` as these are the default 'fall back' params as +institutional-level nf-core config, the parameters will be read as `--bwaalnn 0.04` and `--bwaalnl 32` as these are the default 'fall back' params as indicated in the example above. If you specify as `nextflow run -profile shh,pathogen_loose`, as expected @@ -1467,59 +1463,59 @@ For example, I normally look for things like: General Stats Table: -* Do I see the expected number of raw sequencing reads (summed across each set +- Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -* Does the percentage of trimmed reads look normal for aDNA, and do lengths +- Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? -* Does ClusterFactor or 'Dups' look high (e.g. >2 or >10% respectively) +- Does ClusterFactor or 'Dups' look high (e.g. >2 or >10% respectively) suggesting over-amplified or badly preserved samples? 
-* Do the mapped reads show increased frequency of C>Ts on the 5' end of +- Do the mapped reads show increased frequency of C>Ts on the 5' end of molecules? -* Is the number of SNPs used for nuclear contamination really low for any +- Is the number of SNPs used for nuclear contamination really low for any individuals (e.g. < 100)? If so, then the estimates might not be very accurate. FastQC (pre-AdapterRemoval): -* Do I see any very early drop off of sequence quality scores suggesting a +- Do I see any very early drop off of sequence quality scores suggesting a problematic sequencing run? -* Do I see outlier GC content distributions? -* Do I see high sequence duplication levels? +- Do I see outlier GC content distributions? +- Do I see high sequence duplication levels? AdapterRemoval: -* Do I see high numbers of singletons or discarded read pairs? +- Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -* Do I see improved sequence quality scores along the length of reads? -* Do I see reduced adapter content levels? +- Do I see improved sequence quality scores along the length of reads? +- Do I see reduced adapter content levels? Samtools Flagstat (pre/post Filter): -* Do I see outliers, e.g. with unusually high levels of human DNA, (indicative +- Do I see outliers, e.g. with unusually high levels of human DNA, (indicative of contamination) that require downstream closer assessment? Are your samples exceptionally preserved? If not, a value higher than e.g. 50% might require your attention. DeDup/Picard MarkDuplicates: -* Do I see large numbers of duplicates being removed, possibly indicating +- Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? DamageProfiler: -* Do I see evidence of damage on human DNA? - * High numbers of mapped reads but no damage may indicate significant +- Do I see evidence of damage on human DNA? 
+ - High numbers of mapped reads but no damage may indicate significant modern contamination. - * Was the read trimming I specified enough to overcome damage effects? + - Was the read trimming I specified enough to overcome damage effects? SexDetERRmine: -* Do the relative coverages on the X and Y chromosome fall within the expected +- Do the relative coverages on the X and Y chromosome fall within the expected areas of the plot? -* Do all individuals have enough data for accurate sex determination? -* Do the proportions of autosomal/X/Y reads make sense? If there is an +- Do all individuals have enough data for accurate sex determination? +- Do the proportions of autosomal/X/Y reads make sense? If there is an overrepresentation of reads within one bin, is the data enriched for that bin? > Detailed documentation and descriptions for all MultiQC modules can be seen in @@ -1619,7 +1615,7 @@ Prior setting up an nf-core/eager run for metagenomic screening, we will need: We should also ensure we have the very latest version of the nf-core/eager pipeline so we have all latest bugfixes etc. In this case we will be using nf-core/eager version 2.2.0. You should always check on the -[nf-core](https://nf-co.re/eager) website whether a newer release has been made +[nf-core](https://nf-co.re/eager) website whether a newer release has been made (particularly point releases e.g. 2.2.1). ```bash @@ -1910,58 +1906,58 @@ For example, I normally look for things like: General Stats Table: -* Do I see the expected number of raw sequencing reads (summed across each set +- Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -* Does the percentage of trimmed reads look normal for aDNA, and do lengths +- Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? 
-* Does ClusterFactor or 'Dups' look high suggesting over-amplified or +- Does ClusterFactor or 'Dups' look high suggesting over-amplified or badly preserved samples (e.g. >2 or >10% respectively - however given this is on the human reads this is just a rule of thumb and may not reflect the quality of the metagenomic profile) ? -* Does the human DNA show increased frequency of C>Ts on the 5' end of +- Does the human DNA show increased frequency of C>Ts on the 5' end of molecules? FastQC (pre-AdapterRemoval): -* Do I see any very early drop off of sequence quality scores suggesting +- Do I see any very early drop off of sequence quality scores suggesting problematic sequencing run? -* Do I see outlier GC content distributions? -* Do I see high sequence duplication levels? +- Do I see outlier GC content distributions? +- Do I see high sequence duplication levels? AdapterRemoval: -* Do I see high numbers of singletons or discarded read pairs? +- Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -* Do I see improved sequence quality scores along the length of reads? -* Do I see reduced adapter content levels? +- Do I see improved sequence quality scores along the length of reads? +- Do I see reduced adapter content levels? MALT: -* Do I have a reasonable level of mappability? - * Somewhere between 10-30% can be pretty normal for aDNA, whereas e.g. <1% +- Do I have a reasonable level of mappability? + - Somewhere between 10-30% can be pretty normal for aDNA, whereas e.g. <1% requires careful manual assessment -* Do I have a reasonable taxonomic assignment success? - * You hope to have a large number of the mapped reads (from the mappability +- Do I have a reasonable taxonomic assignment success? + - You hope to have a large number of the mapped reads (from the mappability plot) that also have taxonomic assignment. Samtools Flagstat (pre/post Filter): -* Do I see outliers, e.g. 
with unusually high levels of human DNA, (indicative +- Do I see outliers, e.g. with unusually high levels of human DNA, (indicative of contamination) that require downstream closer assessment? DeDup/Picard MarkDuplicates: -* Do I see large numbers of duplicates being removed, possibly indicating +- Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? DamageProfiler: -* Do I see evidence of damage on human DNA? Note this is just a +- Do I see evidence of damage on human DNA? Note this is just a rule-of-thumb/corroboration of any signals you might find in the metagenomic screening and not essential. - * If you have high numbers of human DNA reads but no damage may indicate + - If you have high numbers of human DNA reads but no damage may indicate significant modern contamination. > Detailed documentation and descriptions for all MultiQC modules can be seen in @@ -2086,7 +2082,7 @@ Prior setting up the nf-core/eager run, we will need: We should also ensure we have the very latest version of the nf-core/eager pipeline so we have all latest bugfixes etc. In this case we will be using nf-core/eager version 2.2.0. You should always check on the -[nf-core](https://nf-co.re/eager) website whether a newer release has been made +[nf-core](https://nf-co.re/eager) website whether a newer release has been made (particularly point releases e.g. 2.2.1). ```bash @@ -2532,80 +2528,80 @@ results. For example, I normally look for things like: General Stats Table: -* Do I see the expected number of raw sequencing reads (summed across each set +- Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -* Does the percentage of trimmed reads look normal for aDNA, and do lengths +- Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? 
-* Does the Endogenous DNA (%) columns look reasonable (high enough to indicate +- Does the Endogenous DNA (%) columns look reasonable (high enough to indicate you have received enough coverage for downstream, and/or do you lose an unusually high reads after filtering ) -* Does ClusterFactor or '% Dups' look high (e.g. >2 or >10% respectively - high +- Does ClusterFactor or '% Dups' look high (e.g. >2 or >10% respectively - high values suggesting over-amplified or badly preserved samples i.e. low complexity; note that genome-enrichment libraries may by their nature look higher). -* Do you see an increased frequency of C>Ts on the 5' end of molecules in the +- Do you see an increased frequency of C>Ts on the 5' end of molecules in the mapped reads? -* Do median read lengths look relatively low (normally <= 100 bp) indicating +- Do median read lengths look relatively low (normally <= 100 bp) indicating typically fragmented aDNA? -* Does the % coverage decrease relatively gradually at each depth coverage, and +- Does the % coverage decrease relatively gradually at each depth coverage, and does not drop extremely drastically -* Does the Median coverage and percent >3x (or whatever you set) show sufficient +- Does the Median coverage and percent >3x (or whatever you set) show sufficient coverage for reliable SNP calls and that a good proportion of the genome is covered indicating you have the right reference genome? -* Do you see a high proportion of % Hets, indicating many multi-allelic sites +- Do you see a high proportion of % Hets, indicating many multi-allelic sites (and possibly presence of cross-mapping from other species, that may lead to false positive or less confident SNP calls)? FastQC (pre-AdapterRemoval): -* Do I see any very early drop off of sequence quality scores suggesting +- Do I see any very early drop off of sequence quality scores suggesting problematic sequencing run? -* Do I see outlier GC content distributions? 
-* Do I see high sequence duplication levels? +- Do I see outlier GC content distributions? +- Do I see high sequence duplication levels? AdapterRemoval: -* Do I see high numbers of singletons or discarded read pairs? +- Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -* Do I see improved sequence quality scores along the length of reads? -* Do I see reduced adapter content levels? +- Do I see improved sequence quality scores along the length of reads? +- Do I see reduced adapter content levels? Samtools Flagstat (pre/post Filter): -* Do I see outliers, e.g. with unusually low levels of mapped reads, (indicative +- Do I see outliers, e.g. with unusually low levels of mapped reads, (indicative of badly preserved samples) that require downstream closer assessment? DeDup/Picard MarkDuplicates: -* Do I see large numbers of duplicates being removed, possibly indicating +- Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? PreSeq: -* Do I see a large drop off of a sample's curve away from the theoretical +- Do I see a large drop off of a sample's curve away from the theoretical complexity? If so, this may indicate it's not worth performing deeper sequencing as you will get few unique reads (vs. duplicates that are not any more informative than the reads you've already sequenced) DamageProfiler: -* Do I see evidence of damage on the microbial DNA (i.e. a % C>T of more than ~5% in +- Do I see evidence of damage on the microbial DNA (i.e. a % C>T of more than ~5% in the first few nucleotide positions?) ? If not, possibly your mapped reads are deriving from modern contamination. QualiMap: -* Do you see a peak of coverage (X) at a good level, e.g. >= 3x, indicating +- Do you see a peak of coverage (X) at a good level, e.g. >= 3x, indicating sufficient coverage for reliable SNP calls? 
MultiVCFAnalyzer: -* Do I have a good number of called SNPs that suggest the samples have genomes +- Do I have a good number of called SNPs that suggest the samples have genomes with sufficient nucleotide diversity to inform phylogenetic analysis? -* Do you have a large number of discarded SNP calls? -* Are the % Hets very high indicating possible cross-mapping from off-target +- Do you have a large number of discarded SNP calls? +- Are the % Hets very high indicating possible cross-mapping from off-target organisms that may confounding variant calling? > Detailed documentation and descriptions for all MultiQC modules can be seen in diff --git a/nextflow_schema.json b/nextflow_schema.json index ca9f4fe6c..be3132b2e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,1784 +1,1690 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", - "title": "nf-core/eager pipeline parameters", - "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data, and additional metadata.", - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string", - "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", - "fa_icon": "fas fa-dna", - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. 
The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." - }, - "udg_type": { - "type": "string", - "default": "none", - "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input.", - "fa_icon": "fas fa-vial", - "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). 
Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", - "enum": [ - "none", - "half", - "full" - ] - }, - "single_stranded": { - "type": "boolean", - "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. Not required for TSV input.", - "fa_icon": "fas fa-minus", - "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). Default: false\n\nOnly required when using the 'Path' method of `--input`" - }, - "single_end": { - "type": "boolean", - "description": "Specifies that the input is single end reads. Not required for TSV input.", - "fa_icon": "fas fa-align-left", - "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" - }, - "colour_chemistry": { - "type": "integer", - "default": 4, - "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. 
Options: 2, 4.", - "fa_icon": "fas fa-palette", - "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." - }, - "bam": { - "type": "boolean", - "description": "Specifies that the input is in BAM format. Not required for TSV input.", - "fa_icon": "fas fa-align-justify", - "help_text": "Specifies the input file type to `--input` is in BAM format. This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" - } - }, - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." - }, - "input_data_additional_options": { - "title": "Input Data Additional Options", - "type": "object", - "description": "Additional options regarding input data.", - "default": "", - "properties": { - "snpcapture_bed": { - "type": "string", - "fa_icon": "fas fa-magnet", - "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome.", - "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. 
This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, on-target metrics are automatically generated for you by qualimap." - }, - "run_convertinputbam": { - "type": "boolean", - "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", - "fa_icon": "fas fa-undo-alt", - "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." - } - }, - "fa_icon": "far fa-plus-square" - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "properties": { - "fasta": { - "type": "string", - "fa_icon": "fas fa-font", - "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", - "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" - }, - "genome": { - "type": "string", - "description": "Name of iGenomes reference (required if not FASTA reference). 
Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", - "fa_icon": "fas fa-book", - "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n**NB** Requires argument `--igenomes_ignore false` as iGenomes ignored by default in nf-core/eager\n\n```" - }, - "igenomes_base": { - "type": "string", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. 
You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - }, - "bwa_index": { - "type": "string", - "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. Do not include the files themselves. Most likely the same directory of the file provided with --fasta). If not supplied will be made for you.", - "fa_icon": "fas fa-address-book", - "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta` nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." - }, - "bt2_index": { - "type": "string", - "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", - "fa_icon": "far fa-address-book", - "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. 
nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." - }, - "fasta_index": { - "type": "string", - "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", - "fa_icon": "far fa-bookmark", - "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" - }, - "seq_dict": { - "type": "string", - "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", - "fa_icon": "fas fa-spell-check", - "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" - }, - "large_ref": { - "type": "boolean", - "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", - "fa_icon": "fas fa-mountain", - "help_text": "This parameter is required to be set for large reference genomes. 
If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. Default: off" - }, - "save_reference": { - "type": "boolean", - "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", - "fa_icon": "far fa-save", - "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" - } - }, - "description": "Specify locations of references and optionally, additional pre-made indices", - "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
- }, - "output_options": { - "title": "Output options", - "type": "object", - "description": "Specify where to put output files and optional saving of intermediate files", - "default": "", - "properties": { - "outdir": { - "type": "string", - "description": "The output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open", - "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "hidden": true, - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ] - } - }, - "fa_icon": "fas fa-cloud-download-alt" - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "hidden": true, - "fa_icon": "fas fa-question-circle" - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "An email address to send a summary email to when the pipeline is completed.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, 
only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "hidden": true, - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true, - "help_text": "Set to receive plain-text e-mails instead of HTML formatted." - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true, - "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true, - "help_text": "Set to disable colourful command line output and live life in monochrome." 
- }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "hidden": true, - "description": "Parameter used for checking conda channels to be set correctly." - }, - "schema_ignore_params": { - "type": "string", - "fa_icon": "fas fa-not-equal", - "description": "String to specify ignored parameters for parameter validation", - "hidden": true, - "default": "genomes" - } - }, - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } - }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog", - "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", - "fa_icon": "fas fa-users-cog" - }, - "hostnames": { - "type": "string", - "description": "Institutional configs hostname.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "awsqueue": { - "type": "string", - "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", - "fa_icon": "fab fa-aws" - }, - "awsregion": { - "type": "string", - "default": "eu-west-1", - "description": "The AWS Region for your AWS Batch job to run on", - "fa_icon": "fab fa-aws" - }, - "awscli": { - "type": "string", - "description": "Path to the AWS CLI tool", - "fa_icon": "fab fa-aws" - } - } - }, - "skip_steps": { - "title": "Skip steps", - "type": "object", - "description": "Skip any of the mentioned steps.", - "default": "", - "properties": { - "skip_fastqc": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - 
"help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." - }, - "skip_adapterremoval": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." - }, - "skip_preseq": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the computation of library complexity estimation." - }, - "skip_deduplication": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" - }, - "skip_damage_calculation": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" - }, - "skip_qualimap": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" - } - }, - "fa_icon": "fas fa-fast-forward", - "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." - }, - "complexity_filtering": { - "title": "Complexity filtering", - "type": "object", - "description": "Processing of Illumina two-colour chemistry data.", - "default": "", - "properties": { - "complexity_filter_poly_g": { - "type": "boolean", - "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", - "fa_icon": "fas fa-power-off", - "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. 
This can be useful for trimming ploy-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" - }, - "complexity_filter_poly_g_min": { - "type": "integer", - "default": 10, - "description": "Specify length of poly-g min for clipping to be performed.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" - } - }, - "fa_icon": "fas fa-filter", - "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" - }, - "read_merging_and_adapter_removal": { - "title": "Read merging and adapter removal", - "type": "object", - "description": "Options for adapter clipping and paired-end merging.", - "default": "", - "properties": { - "clip_forward_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "description": "Specify adapter sequence to be clipped off (forward strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the forward read. By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" - }, - "clip_reverse_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", - "description": "Specify adapter sequence to be clipped off (reverse strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. 
This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" - }, - "clip_adapters_list": { - "type": "string", - "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", - "fa_icon": "fas fa-cut", - "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters . First column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" - }, - "clip_readlength": { - "type": "integer", - "default": 30, - "description": "Specify read minimum length to be kept for downstream analysis.", - "fa_icon": "fas fa-ruler", - "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. 
A caveat, however, is that this will cause a very large increase in computational run time, due to all reads in the library will be being mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" - }, - "clip_min_read_quality": { - "type": "integer", - "default": 20, - "description": "Specify minimum base quality for trimming off bases.", - "fa_icon": "fas fa-medal", - "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" - }, - "min_adap_overlap": { - "type": "integer", - "default": 1, - "description": "Specify minimum adapter overlap required for clipping.", - "fa_icon": "fas fa-hands-helping", - "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" - }, - "skip_collapse": { - "type": "boolean", - "description": "Skip of merging forward and reverse reads together and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will be NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. 
This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" - }, - "skip_trim": { - "type": "boolean", - "description": "Skip adapter and quality trimming.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" - }, - "preserve5p": { - "type": "boolean", - "description": "Skip quality base trimming (n, score, window) of 5 prime end.", - "fa_icon": "fas fa-life-ring", - "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used. Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" - }, - "mergedonly": { - "type": "boolean", - "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", - "fa_icon": "fas fa-handshake", - "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. 
reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." - }, - "qualitymax": { - "type": "integer", - "description": "Specify the maximum Phred score used in input FASTQ files", - "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", - "default": 41, - "fa_icon": "fas fa-arrow-up" - }, - "run_post_ar_trimming": { - "type": "boolean", - "description": "Turn on trimming of inline barcodes (i.e. 
internal barcodes after adapter removal)", - "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior ligation of adapters and indicies (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" - }, - "post_ar_trim_front": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of a merged read or R1", - "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" - }, - "post_ar_trim_tail": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of of a merged read or R1", - "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" - }, - "post_ar_trim_front2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of R2", - "help_text": "Specify the number of bases to trim off the start of a read in an unmerged forward read (R1) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" - }, - "post_ar_trim_tail2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of R2", - "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" - } - }, - "fa_icon": "fas fa-cut", - "help_text": "These options handle 
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible skip one or the other." - }, - "mapping": { - "title": "Read mapping to reference genome", - "type": "object", - "description": "Options for reference-genome mapping", - "default": "", - "properties": { - "mapper": { - "title": "Mapper", - "type": "string", - "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", - "default": "bwaaln", - "fa_icon": "fas fa-layer-group", - "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). 
Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", - "enum": [ - "bwaaln", - "bwamem", - "circularmapper", - "bowtie2" - ] - }, - "bwaalnn": { - "type": "number", - "default": 0.01, - "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.04` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" - }, - "bwaalnk": { - "type": "integer", - "default": 2, - "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", - "fa_icon": "fas fa-drafting-compass", - "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" - }, - "bwaalnl": { - "type": "integer", - "default": 1024, - "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al. 
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" - }, - "bwaalno": { - "type": "integer", - "default": 2, - "fa_icon": "fas fa-people-arrows", - "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", - "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" - }, - "circularextension": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only).", - "fa_icon": "fas fa-external-link-alt", - "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" - }, - "circulartarget": { - "type": "string", - "default": "MT", - "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", - "fa_icon": "fas fa-bullseye", - "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" - }, - "circularfilter": { - "type": "boolean", - "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", - "fa_icon": "fas fa-filter", - "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" - }, - "bt2_alignmode": { - "type": "string", - "default": "local", - "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": [ - "local", - "end-to-end" - ] - }, - "bt2_sensitivity": { - "type": "string", - "default": "sensitive", - "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", - "fa_icon": "fas fa-microscope", - "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": [ - "no-preset", - "very-fast", - "fast", - "sensitive", - "very-sensitive" - ] - }, - "bt2n": { - "type": "integer", - "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", - "default": 0 - }, - "bt2l": { - "type": "integer", - "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`", - "default": 0 - }, - "bt2_trim5": { - "type": "integer", - "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`", - "default": 0 - }, - "bt2_trim3": { - "type": "integer", - "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. 
Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`", - "default": 0 - }, - "bt2_maxins": { - "type": "integer", - "default": 500, - "fa_icon": "fas fa-exchange-alt", - "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", - "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" - } - }, - "fa_icon": "fas fa-layer-group", - "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" - }, - "host_removal": { - "title": "Removal of Host-Mapped Reads", - "type": "object", - "description": "Options for production of host-read removed FASTQ files for privacy reasons.", - "default": "", - "properties": { - "hostremoval_input_fastq": { - "type": "boolean", - "description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", - "fa_icon": "fas fa-power-off", - "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" - }, - "hostremoval_mode": { - "type": "string", - "default": "remove", - "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", - "fa_icon": "fas fa-mask", - "help_text": "Read removal mode. 
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", - "enum": [ - "strip", - "replace", - "remove" - ] - } - }, - "fa_icon": "fas fa-user-shield", - "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file, are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows you to provide other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed library, i.e. after lane merging." - }, - "bam_filtering": { - "title": "BAM Filtering", - "type": "object", - "description": "Options for quality filtering and how to deal with off-target unmapped reads.", - "default": "", - "properties": { - "run_bam_filtering": { - "type": "boolean", - "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" - }, - "bam_mapping_quality_threshold": { - "type": "integer", - "description": "Minimum mapping quality for reads filter.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. 
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", - "default": 0 - }, - "bam_filter_minreadlength": { - "type": "integer", - "fa_icon": "fas fa-ruler-horizontal", - "description": "Specify minimum read length to be kept after mapping.", - "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", - "default": 0 - }, - "bam_unmapped_type": { - "type": "string", - "default": "discard", - "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format Options: 'discard', 'bam', 'fastq', 'both'.", - "fa_icon": "fas fa-trash-alt", - "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", - "enum": [ - "discard", - "keep", - "bam", - "fastq", - "both" - ] - } - }, - "fa_icon": "fas fa-sort-amount-down", - "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" - }, - "deduplication": { - "title": "DeDuplication", - "type": "object", - "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", - "default": "", - "properties": { - "dedupper": { - "type": "string", - "default": "markduplicates", - "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", - "fa_icon": "fas fa-object-group", - "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", - "enum": [ - "markduplicates", - "dedup" - ] - }, - "dedup_all_merged": { - "type": "boolean", - "description": "Turn on treating all reads as merged reads.", - "fa_icon": "fas fa-handshake", - "help_text": "Sets DeDup to treat all reads as merged reads. 
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" - } - }, - "fa_icon": "fas fa-clone", - "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." - }, - "library_complexity_analysis": { - "title": "Library Complexity Analysis", - "type": "object", - "description": "Options for calculating library complexity (i.e. how many unique reads are present).", - "default": "", - "properties": { - "preseq_mode": { - "type": "string", - "default": "c_curve", - "description": "Specify which mode of preseq to run.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", - "enum": [ - "c_curve", - "lc_extrap" - ] - }, - "preseq_step_size": { - "type": "integer", - "default": 1000, - "description": "Specify the step size of Preseq.", - "fa_icon": "fas fa-shoe-prints", - "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" - }, - "preseq_maxextrap": { - "type": "integer", - "default": 10000000000, - "description": "Specify the maximum extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-ban", - "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" - }, - "preseq_terms": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" - }, - "preseq_bootstrap": { - "type": "integer", - "default": 100, - "description": "Specify number of bootstraps to perform (lc_extrap mode only)", - "fa_icon": "fab fa-bootstrap", - "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" - }, - "preseq_cval": { - "type": "number", - "default": 0.95, - "description": "Specify confidence interval level (lc_extrap mode only)", - "fa_icon": "fas fa-check-circle", - "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" - } - }, - "fa_icon": "fas fa-bezier-curve", - "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
- }, - "adna_damage_analysis": { - "title": "(aDNA) Damage Analysis", - "type": "object", - "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", - "default": "", - "properties": { - "damageprofiler_length": { - "type": "integer", - "default": 100, - "description": "Specify length filter for DamageProfiler.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" - }, - "damageprofiler_threshold": { - "type": "integer", - "default": 15, - "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" - }, - "damageprofiler_yaxis": { - "type": "number", - "default": 0.3, - "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'.", - "fa_icon": "fas fa-ruler-vertical", - "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. 
Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" - }, - "run_pmdtools": { - "type": "boolean", - "description": "Turn on PMDtools", - "fa_icon": "fas fa-power-off", - "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" - }, - "pmdtools_range": { - "type": "integer", - "default": 10, - "description": "Specify range of bases for PMDTools to scan for damage.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" - }, - "pmdtools_threshold": { - "type": "integer", - "default": 3, - "description": "Specify PMDScore threshold for PMDTools.", - "fa_icon": "fas fa-chart-bar", - "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" - }, - "pmdtools_reference_mask": { - "type": "string", - "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", - "fa_icon": "fas fa-mask", - "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." - }, - "pmdtools_max_reads": { - "type": "integer", - "default": 10000, - "description": "Specify the maximum number of reads to consider for metrics generation.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "The maximum number of reads used for damage assessment in PMDtools. 
Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" - }, - "pmdtools_platypus": { - "type": "boolean", - "description": "Append big list of base frequencies for platypus to output.", - "fa_icon": "fas fa-power-off", - "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. By default turned off.\n" - }, - "run_mapdamage_rescaling": { - "type": "boolean", - "fa_icon": "fas fa-map", - "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probablistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" - }, - "rescale_length_5p": { - "type": "integer", - "default": 12, - "fa_icon": "fas fa-balance-scale-right", - "description": "Length of read for mapDamage2 to rescale from 5p end.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." 
- }, - "rescale_length_3p": { - "type": "integer", - "default": 12, - "fa_icon": "fas fa-balance-scale-left", - "description": "Length of read for mapDamage2 to rescale from 3p end.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." - } - }, - "fa_icon": "fas fa-chart-line", - "help_text": "More documentation can be seen in the follow links for:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming is run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" - }, - "feature_annotation_statistics": { - "title": "Feature Annotation Statistics", - "type": "object", - "description": "Options for getting reference annotation statistics (e.g. gene coverages)", - "default": "", - "properties": { - "run_bedtools_coverage": { - "type": "boolean", - "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", - "fa_icon": "fas fa-chart-area", - "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" - }, - "anno_file": { - "type": "string", - "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", - "fa_icon": "fas fa-file-signature", - "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). 
Must be in quotes.\n" - } - }, - "fa_icon": "fas fa-scroll", - "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" - }, - "bam_trimming": { - "title": "BAM Trimming", - "type": "object", - "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", - "default": "", - "properties": { - "run_trim_bam": { - "type": "boolean", - "description": "Turn on BAM trimming. Will only run on non-UDG or half-UDG libraries", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." 
- }, - "bamutils_clip_double_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_softclip": { - "type": "boolean", - "description": "Turn on using softclip instead of hard masking.", - "fa_icon": "fas fa-paint-roller", - "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. 
Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" - } - }, - "fa_icon": "fas fa-eraser", - "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" - }, - "genotyping": { - "title": "Genotyping", - "type": "object", - "description": "Options for variant calling.", - "default": "", - "properties": { - "run_genotyping": { - "type": "boolean", - "description": "Turn on genotyping of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." - }, - "genotyping_tool": { - "type": "string", - "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", - "fa_icon": "fas fa-tools", - "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. 
Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> > Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", - "enum": [ - "ug", - "hc", - "freebayes", - "pileupcaller", - "angsd" - ] - }, - "genotyping_source": { - "type": "string", - "default": "raw", - "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", - "fa_icon": "fas fa-faucet", - "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", - "enum": [ - "raw", - "pmd", - "trimmed", - "rescaled" - ] - }, - "gatk_call_conf": { - "type": "integer", - "default": 30, - "description": "Specify GATK phred-scaled confidence threshold.", - "fa_icon": "fas fa-balance-scale-right", - "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" - }, - "gatk_ploidy": { - "type": "integer", - "default": 2, - "description": "Specify GATK organism ploidy.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms. 
Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" - }, - "gatk_downsample": { - "type": "integer", - "default": 250, - "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", - "fa_icon": "fas fa-icicles", - "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" - }, - "gatk_dbsnp": { - "type": "string", - "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", - "fa_icon": "fas fa-marker", - "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.\n" - }, - "gatk_hc_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", - "enum": [ - "EMIT_ALL_ACTIVE_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_hc_emitrefconf": { - "type": "string", - "default": "GVCF", - "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. 
Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", - "enum": [ - "NONE", - "GVCF", - "BP_RESOLUTION" - ] - }, - "gatk_ug_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", - "enum": [ - "EMIT_ALL_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_ug_genotype_model": { - "type": "string", - "default": "SNP", - "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL`'. 
Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", - "enum": [ - "SNP", - "INDEL", - "BOTH", - "GENERALPLOIDYSNP", - "GENERALPLOIDYINDEL" - ] - }, - "gatk_ug_keep_realign_bam": { - "type": "boolean", - "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "fa_icon": "fas fa-align-left", - "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." - }, - "gatk_ug_defaultbasequalities": { - "type": "string", - "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", - "fa_icon": "fas fa-undo-alt", - "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" - }, - "freebayes_C": { - "type": "integer", - "default": 1, - "description": "Specify minimum required supporting observations to consider a variant.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify minimum required supporting observations to consider a variant. 
Default: `1`\n\n> Modifies freebayes parameter: `-C`" - }, - "freebayes_g": { - "type": "integer", - "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", - "fa_icon": "fab fa-think-peaks", - "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", - "default": 0 - }, - "freebayes_p": { - "type": "integer", - "default": 2, - "description": "Specify ploidy of sample in FreeBayes.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" - }, - "pileupcaller_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for pileupCaller.", - "fa_icon": "fas fa-bed", - "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" - }, - "pileupcaller_snpfile": { - "type": "string", - "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", - "fa_icon": "fas fa-sliders-h", - "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" - }, - "pileupcaller_method": { - "type": "string", - "default": "randomHaploid", - "description": "Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", - "fa_icon": "fas fa-toolbox", - "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. 
Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", - "enum": [ - "randomHaploid", - "randomDiploid", - "majorityCall" - ] - }, - "pileupcaller_transitions_mode": { - "type": "string", - "default": "AllSites", - "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", - "enum": [ - "AllSites", - "TransitionsMissing", - "SkipTransitions" - ] - }, - "pileupcaller_min_map_quality": { - "type": "integer", - "default": 30, - "description": "The minimum mapping quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." - }, - "pileupcaller_min_base_quality": { - "type": "integer", - "default": 30, - "description": "The minimum base quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." - }, - "angsd_glmodel": { - "type": "string", - "default": "samtools", - "description": "Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "Specify which genotype likelihood model to use. Options: `'samtools`, `'gatk'`, `'soapsnp'`, `'syk'`. 
Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", - "enum": [ - "samtools", - "gatk", - "soapsnp", - "syk" - ] - }, - "angsd_glformat": { - "type": "string", - "default": "binary", - "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", - "fa_icon": "fas fa-text-height", - "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle_binary'`. Default: `'text'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: textoutput of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle_binary`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlF`", - "enum": [ - "text", - "binary", - "binary_three", - "beagle" - ] - }, - "angsd_createfasta": { - "type": "boolean", - "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", - "fa_icon": "fas fa-align-justify", - "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" - }, - "angsd_fastamethod": { - "type": "string", - "default": "random", - "description": "Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", - "fa_icon": "fas fa-toolbox", - "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. Will output the most common non-N base at each given position, whereas 'random' will pick one at random. 
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", - "enum": [ - "random", - "common" - ] - }, - "run_bcftools_stats": { - "type": "boolean", - "default": true, - "description": "Turn on bcftools stats generation for VCF based variant calling statistics", - "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", - "fa_icon": "far fa-chart-bar" - } - }, - "fa_icon": "fas fa-sliders-h", - "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." - }, - "consensus_sequence_generation": { - "title": "Consensus Sequence Generation", - "type": "object", - "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", - "default": "", - "properties": { - "run_vcf2genome": { - "type": "boolean", - "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP` flags. Typically useful for small genomes such as mitochondria.\n" - }, - "vcf2genome_outfile": { - "type": "string", - "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.vcf` in the file name.", - "fa_icon": "fas fa-file-alt", - "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" - }, - "vcf2genome_header": { - "type": "string", - "description": "Specify the header name of the consensus sequence entry within the FASTA file.", - "fa_icon": "fas fa-heading", - "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" - }, - "vcf2genome_minc": { - "type": "integer", - "default": 5, - "description": "Minimum depth coverage required for a call to be included (else N will be called).", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Minimum depth coverage for a SNP to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" - }, - "vcf2genome_minq": { - "type": "integer", - "default": 30, - "description": "Minimum genotyping quality of a call to be called. Else N will be called.", - "fa_icon": "fas fa-medal", - "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" - }, - "vcf2genome_minfreq": { - "type": "number", - "default": 0.8, - "description": "Minimum fraction of reads supporting a call to be included. 
Else N will be called.", - "fa_icon": "fas fa-percent", - "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" - } - }, - "fa_icon": "fas fa-handshake", - "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." - }, - "snp_table_generation": { - "title": "SNP Table Generation", - "type": "object", - "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", - "default": "", - "properties": { - "run_multivcfanalyzer": { - "type": "boolean", - "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on MultiVCFAnalyzer. Will only work when in combination with UnifiedGenotyper genotyping module.\n" - }, - "write_allele_frequencies": { - "type": "boolean", - "description": "Turn on writing write allele frequencies in the SNP table.", - "fa_icon": "fas fa-pen", - "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" - }, - "min_genotype_quality": { - "type": "integer", - "default": 30, - "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", - "fa_icon": "fas fa-medal", - "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" - }, - "min_base_coverage": { - "type": "integer", - "default": 5, - "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" - }, - "min_allele_freq_hom": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" - }, - "min_allele_freq_het": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." - }, - "additional_vcf_files": { - "type": "string", - "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. 
Use wildcard(s) for multiple files.", - "fa_icon": "fas fa-copy", - "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for [GATK UnifiedGenotyping](#genotyping-parameters) module above." - }, - "reference_gff_annotations": { - "type": "string", - "default": "NA", - "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", - "fa_icon": "fas fa-file-signature", - "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" - }, - "reference_gff_exclude": { - "type": "string", - "default": "NA", - "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", - "fa_icon": "fas fa-times", - "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" - }, - "snp_eff_results": { - "type": "string", - "default": "NA", - "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", - "fa_icon": "fas fa-magic", - "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." - } - }, - "fa_icon": "fas fa-table", - "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. The current version\nof MultiVCFAnalyzer version only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). 
A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." - }, - "mitochondrial_to_nuclear_ratio": { - "title": "Mitochondrial to Nuclear Ratio", - "type": "object", - "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", - "default": "", - "properties": { - "run_mtnucratio": { - "type": "boolean", - "description": "Turn on mitochondrial to nuclear ratio calculation.", - "fa_icon": "fas fa-balance-scale-left", - "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" - }, - "mtnucratio_header": { - "type": "string", - "default": "MT", - "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", - "fa_icon": "fas fa-heading", - "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on hs37d5/GrCH37 human reference genome. 
Default: 'MT'" - } - }, - "fa_icon": "fas fa-balance-scale-left", - "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" - }, - "human_sex_determination": { - "title": "Human Sex Determination", - "type": "object", - "description": "Options for the calculation of biological sex of human individuals.", - "default": "", - "properties": { - "run_sexdeterrmine": { - "type": "boolean", - "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", - "fa_icon": "fas fa-transgender-alt", - "help_text": "Specify to run the optional process of sex determination.\n" - }, - "sexdeterrmine_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", - "fa_icon": "fas fa-bed", - "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." - } - }, - "fa_icon": "fas fa-transgender", - "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
- }, - "nuclear_contamination_for_human_dna": { - "title": "Nuclear Contamination for Human DNA", - "type": "object", - "description": "Options for the estimation of contamination of human DNA.", - "default": "", - "properties": { - "run_nuclear_contamination": { - "type": "boolean", - "description": "Turn on nuclear contamination estimation for human reference genomes.", - "fa_icon": "fas fa-power-off", - "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" - }, - "contamination_chrom_name": { - "type": "string", - "default": "X", - "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", - "fa_icon": "fas fa-address-card", - "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." - } - }, - "fa_icon": "fas fa-radiation-alt" - }, - "metagenomic_screening": { - "title": "Metagenomic Screening", - "type": "object", - "description": "Options for metagenomic screening of off-target reads.", - "default": "", - "properties": { - "metagenomic_complexity_filter": { - "type": "boolean", - "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", - "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", - "fa_icon": "fas fa-filter" - }, - "metagenomic_complexity_entropy": { - "type": "number", - "default": 0.3, - "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.", - "minimum": 0, - "maximum": 1, - "help_text": "Specify a minimum entropy threshold that under which it will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies`bbduk` parameter `entropy=`", - "fa_icon": "fas fa-percent" - }, - "run_metagenomic_screening": { - "type": "boolean", - "description": "Turn on metagenomic screening module for reference-unmapped reads.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on the metagenomic screening module.\n" - }, - "metagenomic_tool": { - "type": "string", - "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", - "fa_icon": "fas fa-tools", - "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. You should clean the\nwork directory with the command to ensure non-redundancy and large HDD\nfootprints!" 
- }, - "database": { - "type": "string", - "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", - "fa_icon": "fas fa-database", - "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." - }, - "metagenomic_min_support_reads": { - "type": "integer", - "default": 1, - "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" - }, - "percent_identity": { - "type": "integer", - "default": 85, - "description": "Percent identity value threshold for MALT.", - "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" - }, - "malt_mode": { - "type": "string", - "default": "BlastN", - "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", - "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. 
Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", - "enum": [ - "BlastN", - "BlastP", - "BlastX" - ] - }, - "malt_alignment_mode": { - "type": "string", - "default": "SemiGlobal", - "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", - "enum": [ - "Local", - "SemiGlobal" - ] - }, - "malt_top_percent": { - "type": "integer", - "default": 1, - "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", - "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" - }, - "malt_min_support_mode": { - "type": "string", - "default": "percent", - "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. 
Options: 'percent', 'reads'.", - "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", - "enum": [ - "percent", - "reads" - ] - }, - "malt_min_support_percent": { - "type": "number", - "default": 0.01, - "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", - "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" - }, - "malt_max_queries": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of queries a read can have for MALT.", - "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" - }, - "malt_memory_mode": { - "type": "string", - "default": "load", - "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", - "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. 
Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", - "enum": [ - "load", - "page", - "map" - ] - }, - "malt_sam_output": { - "type": "boolean", - "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", - "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" - } - }, - "fa_icon": "fas fa-search", - "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. 
the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set a absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." - }, - "metagenomic_authentication": { - "title": "Metagenomic Authentication", - "type": "object", - "description": "Options for authentication of metagenomic screening performed by MALT.", - "default": "", - "properties": { - "run_maltextract": { - "type": "boolean", - "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" - }, - "maltextract_taxon_list": { - "type": "string", - "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", - "fa_icon": "fas fa-list-ul", - "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. 
In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_ncbifiles": { - "type": "string", - "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", - "fa_icon": "fas fa-database", - "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_filter": { - "type": "string", - "default": "def_anc", - "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", - "fa_icon": "fas fa-filter", - "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", - "enum": [ - "def_anc", - "default", - "ancient", - "scan", - "crawl", - "srna" - ] - }, - "maltextract_toppercent": { - "type": "number", - "default": 0.01, - "description": "Specify percent of top alignments to use.", - "fa_icon": "fas fa-percent", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 
1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" - }, - "maltextract_destackingoff": { - "type": "boolean", - "description": "Turn off destacking.", - "fa_icon": "fas fa-align-center", - "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" - }, - "maltextract_downsamplingoff": { - "type": "boolean", - "description": "Turn off downsampling.", - "fa_icon": "fab fa-creative-commons-sampling", - "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" - }, - "maltextract_duplicateremovaloff": { - "type": "boolean", - "description": "Turn off duplicate removal.", - "fa_icon": "fas fa-align-left", - "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" - }, - "maltextract_matches": { - "type": "boolean", - "description": "Turn on exporting alignments of hits in BLAST format.", - "fa_icon": "fas fa-equals", - "help_text": "\nExport alignments of hits for each node in BLAST format. 
By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" - }, - "maltextract_megansummary": { - "type": "boolean", - "description": "Turn on export of MEGAN summary files.", - "fa_icon": "fas fa-download", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" - }, - "maltextract_percentidentity": { - "type": "number", - "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", - "default": 85, - "fa_icon": "fas fa-id-card", - "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" - }, - "maltextract_topalignment": { - "type": "boolean", - "description": "Turn on using top alignments per read after filtering.", - "fa_icon": "fas fa-star-half-alt", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. 
Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" - } - }, - "fa_icon": "fas fa-tasks", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", + "title": "nf-core/eager pipeline parameters", + "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data, and additional metadata.", + "required": ["input"], + "properties": { + "input": { + "type": "string", + "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", + "fa_icon": "fas fa-dna", + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. 
Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." + }, + "udg_type": { + "type": "string", + "default": "none", + "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input.", + "fa_icon": "fas fa-vial", + "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", + "enum": ["none", "half", "full"] + }, + "single_stranded": { + "type": "boolean", + "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. 
Not required for TSV input.", + "fa_icon": "fas fa-minus", + "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). Default: false\n\nOnly required when using the 'Path' method of `--input`" + }, + "single_end": { + "type": "boolean", + "description": "Specifies that the input is single end reads. Not required for TSV input.", + "fa_icon": "fas fa-align-left", + "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" + }, + "colour_chemistry": { + "type": "integer", + "default": 4, + "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. Options: 2, 4.", + "fa_icon": "fas fa-palette", + "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." + }, + "bam": { + "type": "boolean", + "description": "Specifies that the input is in BAM format. Not required for TSV input.", + "fa_icon": "fas fa-align-justify", + "help_text": "Specifies the input file type to `--input` is in BAM format. 
This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" + } + }, + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." + }, + "input_data_additional_options": { + "title": "Input Data Additional Options", + "type": "object", + "description": "Additional options regarding input data.", + "default": "", + "properties": { + "snpcapture_bed": { + "type": "string", + "fa_icon": "fas fa-magnet", + "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome.", + "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, on-target metrics are automatically generated for you by qualimap." + }, + "run_convertinputbam": { + "type": "boolean", + "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", + "fa_icon": "fas fa-undo-alt", + "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." 
+ } + }, + "fa_icon": "far fa-plus-square" + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "properties": { + "fasta": { + "type": "string", + "fa_icon": "fas fa-font", + "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", + "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" + }, + "genome": { + "type": "string", + "description": "Name of iGenomes reference (required if not FASTA reference). Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", + "fa_icon": "fas fa-book", + "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. 
cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n**NB** Requires argument `--igenomes_ignore false` as iGenomes ignored by default in nf-core/eager\n\n```" + }, + "igenomes_base": { + "type": "string", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + }, + "bwa_index": { + "type": "string", + "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. Do not include the files themselves. Most likely the same directory of the file provided with --fasta). 
If not supplied will be made for you.", + "fa_icon": "fas fa-address-book", + "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta` nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." + }, + "bt2_index": { + "type": "string", + "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", + "fa_icon": "far fa-address-book", + "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." 
+ }, + "fasta_index": { + "type": "string", + "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", + "fa_icon": "far fa-bookmark", + "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" + }, + "seq_dict": { + "type": "string", + "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", + "fa_icon": "fas fa-spell-check", + "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" + }, + "large_ref": { + "type": "boolean", + "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", + "fa_icon": "fas fa-mountain", + "help_text": "This parameter is required to be set for large reference genomes. If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. 
Default: off" + }, + "save_reference": { + "type": "boolean", + "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", + "fa_icon": "far fa-save", + "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" + } + }, + "description": "Specify locations of references and optionally, additional pre-made indices", + "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
+ }, + "output_options": { + "title": "Output options", + "type": "object", + "description": "Specify where to put output files and optional saving of intermediate files", + "default": "", + "properties": { + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open", + "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "hidden": true, + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"] + } + }, + "fa_icon": "fas fa-cloud-download-alt" + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "hidden": true, + "fa_icon": "fas fa-question-circle" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "An email address to send a summary email to when the pipeline is completed.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when 
pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true, + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true, + "help_text": "Set to receive plain-text e-mails instead of HTML formatted." + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true, + "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true, + "help_text": "Set to disable colourful command line output and live life in monochrome." 
+ }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "hidden": true, + "description": "Parameter used for checking conda channels to be set correctly." + }, + "schema_ignore_params": { + "type": "string", + "fa_icon": "fas fa-not-equal", + "description": "String to specify ignored parameters for parameter validation", + "hidden": true, + "default": "genomes" + } + }, + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog", + "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", + "fa_icon": "fas fa-users-cog" + }, + "hostnames": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "awsqueue": { + "type": "string", + "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", + "fa_icon": "fab fa-aws" + }, + "awsregion": { + "type": "string", + "default": "eu-west-1", + "description": "The AWS Region for your AWS Batch job to run on", + "fa_icon": "fab fa-aws" + }, + "awscli": { + "type": "string", + "description": "Path to the AWS CLI tool", + "fa_icon": "fab fa-aws" + } + } + }, + "skip_steps": { + "title": "Skip steps", + "type": "object", + "description": "Skip any of the mentioned steps.", + "default": "", + "properties": { + "skip_fastqc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + 
"help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." + }, + "skip_adapterremoval": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." + }, + "skip_preseq": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the computation of library complexity estimation." + }, + "skip_deduplication": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" + }, + "skip_damage_calculation": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" + }, + "skip_qualimap": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" + } + }, + "fa_icon": "fas fa-fast-forward", + "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." + }, + "complexity_filtering": { + "title": "Complexity filtering", + "type": "object", + "description": "Processing of Illumina two-colour chemistry data.", + "default": "", + "properties": { + "complexity_filter_poly_g": { + "type": "boolean", + "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", + "fa_icon": "fas fa-power-off", + "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. 
This can be useful for trimming poly-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" + }, + "complexity_filter_poly_g_min": { + "type": "integer", + "default": 10, + "description": "Specify length of poly-g min for clipping to be performed.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" + } + }, + "fa_icon": "fas fa-filter", + "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" + }, + "read_merging_and_adapter_removal": { + "title": "Read merging and adapter removal", + "type": "object", + "description": "Options for adapter clipping and paired-end merging.", + "default": "", + "properties": { + "clip_forward_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "description": "Specify adapter sequence to be clipped off (forward strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the forward read. By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" + }, + "clip_reverse_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", + "description": "Specify adapter sequence to be clipped off (reverse strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects.
This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" + }, + "clip_adapters_list": { + "type": "string", + "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", + "fa_icon": "fas fa-cut", + "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters . First column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" + }, + "clip_readlength": { + "type": "integer", + "default": 30, + "description": "Specify read minimum length to be kept for downstream analysis.", + "fa_icon": "fas fa-ruler", + "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. 
A caveat, however, is that this will cause a very large increase in computational run time, as all reads in the library will be mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" + }, + "clip_min_read_quality": { + "type": "integer", + "default": 20, + "description": "Specify minimum base quality for trimming off bases.", + "fa_icon": "fas fa-medal", + "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" + }, + "min_adap_overlap": { + "type": "integer", + "default": 1, + "description": "Specify minimum adapter overlap required for clipping.", + "fa_icon": "fas fa-hands-helping", + "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" + }, + "skip_collapse": { + "type": "boolean", + "description": "Skip merging of forward and reverse reads together and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. 
This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" + }, + "skip_trim": { + "type": "boolean", + "description": "Skip adapter and quality trimming.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" + }, + "preserve5p": { + "type": "boolean", + "description": "Skip quality base trimming (n, score, window) of 5 prime end.", + "fa_icon": "fas fa-life-ring", + "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used. Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" + }, + "mergedonly": { + "type": "boolean", + "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", + "fa_icon": "fas fa-handshake", + "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. 
reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." + }, + "qualitymax": { + "type": "integer", + "description": "Specify the maximum Phred score used in input FASTQ files", + "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", + "default": 41, + "fa_icon": "fas fa-arrow-up" + }, + "run_post_ar_trimming": { + "type": "boolean", + "description": "Turn on trimming of inline barcodes (i.e. 
internal barcodes after adapter removal)", + "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior to ligation of adapters and indices (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" + }, + "post_ar_trim_front": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of a merged read or R1", + "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" + }, + "post_ar_trim_tail": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of a merged read or R1", + "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" + }, + "post_ar_trim_front2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of R2", + "help_text": "Specify the number of bases to trim off the start of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" + }, + "post_ar_trim_tail2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of R2", + "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" + } + }, + "fa_icon": "fas fa-cut", + "help_text": "These options handle 
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible skip one or the other." + }, + "mapping": { + "title": "Read mapping to reference genome", + "type": "object", + "description": "Options for reference-genome mapping", + "default": "", + "properties": { + "mapper": { + "title": "Mapper", + "type": "string", + "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", + "default": "bwaaln", + "fa_icon": "fas fa-layer-group", + "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). 
Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", + "enum": ["bwaaln", "bwamem", "circularmapper", "bowtie2"] + }, + "bwaalnn": { + "type": "number", + "default": 0.01, + "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.04` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" + }, + "bwaalnk": { + "type": "integer", + "default": 2, + "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", + "fa_icon": "fas fa-drafting-compass", + "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" + }, + "bwaalnl": { + "type": "integer", + "default": 1024, + "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al. 
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" + }, + "bwaalno": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-people-arrows", + "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", + "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" + }, + "circularextension": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only).", + "fa_icon": "fas fa-external-link-alt", + "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" + }, + "circulartarget": { + "type": "string", + "default": "MT", + "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", + "fa_icon": "fas fa-bullseye", + "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" + }, + "circularfilter": { + "type": "boolean", + "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", + "fa_icon": "fas fa-filter", + "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" + }, + "bt2_alignmode": { + "type": "string", + "default": "local", + "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": ["local", "end-to-end"] + }, + "bt2_sensitivity": { + "type": "string", + "default": "sensitive", + "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", + "fa_icon": "fas fa-microscope", + "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": [ + "no-preset", + "very-fast", + "fast", + "sensitive", + "very-sensitive" + ] + }, + "bt2n": { + "type": "integer", + "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", + "default": 0 + }, + "bt2l": { + "type": "integer", + "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`", + "default": 0 + }, + "bt2_trim5": { + "type": "integer", + "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`", + "default": 0 + }, + "bt2_trim3": { + "type": "integer", + "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. 
May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`", + "default": 0 + }, + "bt2_maxins": { + "type": "integer", + "default": 500, + "fa_icon": "fas fa-exchange-alt", + "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", + "help_text": "The maximum fragment length for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" + } + }, + "fa_icon": "fas fa-layer-group", + "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" + }, + "host_removal": { + "title": "Removal of Host-Mapped Reads", + "type": "object", + "description": "Options for production of host-read removed FASTQ files for privacy reasons.", + "default": "", + "properties": { + "hostremoval_input_fastq": { + "type": "boolean", + "description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", + "fa_icon": "fas fa-power-off", + "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" + }, + "hostremoval_mode": { + "type": "string", + "default": "remove", + "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", + "fa_icon": "fas fa-mask", + "help_text": "Read removal mode. 
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", + "enum": ["strip", "replace", "remove"] + } + }, + "fa_icon": "fas fa-user-shield", + "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file, are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows you to provide other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed library, i.e. after lane merging." + }, + "bam_filtering": { + "title": "BAM Filtering", + "type": "object", + "description": "Options for quality filtering and how to deal with off-target unmapped reads.", + "default": "", + "properties": { + "run_bam_filtering": { + "type": "boolean", + "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" + }, + "bam_mapping_quality_threshold": { + "type": "integer", + "description": "Minimum mapping quality for reads filter.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. 
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", + "default": 0 + }, + "bam_filter_minreadlength": { + "type": "integer", + "fa_icon": "fas fa-ruler-horizontal", + "description": "Specify minimum read length to be kept after mapping.", + "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", + "default": 0 + }, + "bam_unmapped_type": { + "type": "string", + "default": "discard", + "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format Options: 'discard', 'bam', 'fastq', 'both'.", + "fa_icon": "fas fa-trash-alt", + "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", + "enum": ["discard", "keep", "bam", "fastq", "both"] + } + }, + "fa_icon": "fas fa-sort-amount-down", + "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" + }, + "deduplication": { + "title": "DeDuplication", + "type": "object", + "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", + "default": "", + "properties": { + "dedupper": { + "type": "string", + "default": "markduplicates", + "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", + "fa_icon": "fas fa-object-group", + "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", + "enum": ["markduplicates", "dedup"] + }, + "dedup_all_merged": { + "type": "boolean", + "description": "Turn on treating all reads as merged reads.", + "fa_icon": "fas fa-handshake", + "help_text": "Sets DeDup to treat all reads as merged reads. 
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" + } + }, + "fa_icon": "fas fa-clone", + "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." + }, + "library_complexity_analysis": { + "title": "Library Complexity Analysis", + "type": "object", + "description": "Options for calculating library complexity (i.e. how many unique reads are present).", + "default": "", + "properties": { + "preseq_mode": { + "type": "string", + "default": "c_curve", + "description": "Specify which mode of preseq to run.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", + "enum": ["c_curve", "lc_extrap"] + }, + "preseq_step_size": { + "type": "integer", + "default": 1000, + "description": "Specify the step size of Preseq.", + "fa_icon": "fas fa-shoe-prints", + "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" + }, + "preseq_maxextrap": { + "type": "integer", + "default": 10000000000, + "description": "Specify the maximum extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-ban", + "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" + }, + "preseq_terms": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" + }, + "preseq_bootstrap": { + "type": "integer", + "default": 100, + "description": "Specify number of bootstraps to perform (lc_extrap mode only)", + "fa_icon": "fab fa-bootstrap", + "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" + }, + "preseq_cval": { + "type": "number", + "default": 0.95, + "description": "Specify confidence interval level (lc_extrap mode only)", + "fa_icon": "fas fa-check-circle", + "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" + } + }, + "fa_icon": "fas fa-bezier-curve", + "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
+ }, + "adna_damage_analysis": { + "title": "(aDNA) Damage Analysis", + "type": "object", + "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", + "default": "", + "properties": { + "damageprofiler_length": { + "type": "integer", + "default": 100, + "description": "Specify length filter for DamageProfiler.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" + }, + "damageprofiler_threshold": { + "type": "integer", + "default": 15, + "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" + }, + "damageprofiler_yaxis": { + "type": "number", + "default": 0.3, + "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'.", + "fa_icon": "fas fa-ruler-vertical", + "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. 
Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" + }, + "run_pmdtools": { + "type": "boolean", + "description": "Turn on PMDtools", + "fa_icon": "fas fa-power-off", + "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" + }, + "pmdtools_range": { + "type": "integer", + "default": 10, + "description": "Specify range of bases for PMDTools to scan for damage.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" + }, + "pmdtools_threshold": { + "type": "integer", + "default": 3, + "description": "Specify PMDScore threshold for PMDTools.", + "fa_icon": "fas fa-chart-bar", + "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" + }, + "pmdtools_reference_mask": { + "type": "string", + "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", + "fa_icon": "fas fa-mask", + "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." + }, + "pmdtools_max_reads": { + "type": "integer", + "default": 10000, + "description": "Specify the maximum number of reads to consider for metrics generation.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "The maximum number of reads used for damage assessment in PMDtools. 
Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that too low a value can also produce incorrect results.\n\n> Modifies PMDTools parameter: `-n`" + }, + "pmdtools_platypus": { + "type": "boolean", + "description": "Append big list of base frequencies for platypus to output.", + "fa_icon": "fas fa-power-off", + "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. By default turned off.\n" + }, + "run_mapdamage_rescaling": { + "type": "boolean", + "fa_icon": "fas fa-map", + "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" + }, + "rescale_length_5p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-right", + "description": "Length of read for mapDamage2 to rescale from 5p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." 
+ }, + "rescale_length_3p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-left", + "description": "Length of read for mapDamage2 to rescale from 3p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." + } + }, + "fa_icon": "fas fa-chart-line", + "help_text": "More documentation can be seen in the follow links for:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming is run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" + }, + "feature_annotation_statistics": { + "title": "Feature Annotation Statistics", + "type": "object", + "description": "Options for getting reference annotation statistics (e.g. gene coverages)", + "default": "", + "properties": { + "run_bedtools_coverage": { + "type": "boolean", + "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", + "fa_icon": "fas fa-chart-area", + "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" + }, + "anno_file": { + "type": "string", + "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", + "fa_icon": "fas fa-file-signature", + "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). 
Must be in quotes.\n" + } + }, + "fa_icon": "fas fa-scroll", + "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" + }, + "bam_trimming": { + "title": "BAM Trimming", + "type": "object", + "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", + "default": "", + "properties": { + "run_trim_bam": { + "type": "boolean", + "description": "Turn on BAM trimming. Will only run on non-UDG or half-UDG libraries", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." 
+ }, + "bamutils_clip_double_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_softclip": { + "type": "boolean", + "description": "Turn on using softclip instead of hard masking.", + "fa_icon": "fas fa-paint-roller", + "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. 
Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" + } + }, + "fa_icon": "fas fa-eraser", + "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" + }, + "genotyping": { + "title": "Genotyping", + "type": "object", + "description": "Options for variant calling.", + "default": "", + "properties": { + "run_genotyping": { + "type": "boolean", + "description": "Turn on genotyping of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." + }, + "genotyping_tool": { + "type": "string", + "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", + "fa_icon": "fas fa-tools", + "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. 
Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 is officially deprecated by the Broad Institute.", + "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"] + }, + "genotyping_source": { + "type": "string", + "default": "raw", + "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", + "fa_icon": "fas fa-faucet", + "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", + "enum": ["raw", "pmd", "trimmed", "rescaled"] + }, + "gatk_call_conf": { + "type": "integer", + "default": 30, + "description": "Specify GATK phred-scaled confidence threshold.", + "fa_icon": "fas fa-balance-scale-right", + "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + }, + "gatk_ploidy": { + "type": "integer", + "default": 2, + "description": "Specify GATK organism ploidy.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms. 
Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" + }, + "gatk_downsample": { + "type": "integer", + "default": 250, + "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", + "fa_icon": "fas fa-icicles", + "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + }, + "gatk_dbsnp": { + "type": "string", + "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", + "fa_icon": "fas fa-marker", + "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.\n" + }, + "gatk_hc_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", + "enum": [ + "EMIT_ALL_ACTIVE_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_hc_emitrefconf": { + "type": "string", + "default": "GVCF", + "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. 
Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", + "enum": ["NONE", "GVCF", "BP_RESOLUTION"] + }, + "gatk_ug_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "enum": [ + "EMIT_ALL_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_ug_genotype_model": { + "type": "string", + "default": "SNP", + "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL`'. 
Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", + "enum": [ + "SNP", + "INDEL", + "BOTH", + "GENERALPLOIDYSNP", + "GENERALPLOIDYINDEL" + ] + }, + "gatk_ug_keep_realign_bam": { + "type": "boolean", + "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", + "fa_icon": "fas fa-align-left", + "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + }, + "gatk_ug_defaultbasequalities": { + "type": "string", + "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", + "fa_icon": "fas fa-undo-alt", + "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" + }, + "freebayes_C": { + "type": "integer", + "default": 1, + "description": "Specify minimum required supporting observations to consider a variant.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify minimum required supporting observations to consider a variant. 
Default: `1`\n\n> Modifies freebayes parameter: `-C`" + }, + "freebayes_g": { + "type": "integer", + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", + "fa_icon": "fab fa-think-peaks", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", + "default": 0 + }, + "freebayes_p": { + "type": "integer", + "default": 2, + "description": "Specify ploidy of sample in FreeBayes.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" + }, + "pileupcaller_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for pileupCaller.", + "fa_icon": "fas fa-bed", + "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" + }, + "pileupcaller_snpfile": { + "type": "string", + "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", + "fa_icon": "fas fa-sliders-h", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" + }, + "pileupcaller_method": { + "type": "string", + "default": "randomHaploid", + "description": "Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", + "fa_icon": "fas fa-toolbox", + "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. 
Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", + "enum": ["randomHaploid", "randomDiploid", "majorityCall"] + }, + "pileupcaller_transitions_mode": { + "type": "string", + "default": "AllSites", + "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", + "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"] + }, + "pileupcaller_min_map_quality": { + "type": "integer", + "default": 30, + "description": "The minimum mapping quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." + }, + "pileupcaller_min_base_quality": { + "type": "integer", + "default": 30, + "description": "The minimum base quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." + }, + "angsd_glmodel": { + "type": "string", + "default": "samtools", + "description": "Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "Specify which genotype likelihood model to use. Options: `'samtools`, `'gatk'`, `'soapsnp'`, `'syk'`. 
Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", + "enum": ["samtools", "gatk", "soapsnp", "syk"] + }, + "angsd_glformat": { + "type": "string", + "default": "binary", + "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", + "fa_icon": "fas fa-text-height", + "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle'`. Default: `'binary'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: text output of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlF`", + "enum": ["text", "binary", "binary_three", "beagle"] + }, + "angsd_createfasta": { + "type": "boolean", + "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", + "fa_icon": "fas fa-align-justify", + "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" + }, + "angsd_fastamethod": { + "type": "string", + "default": "random", + "description": "Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", + "fa_icon": "fas fa-toolbox", + "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. `'common'` will output the most common non-N base at each given position, whereas `'random'` will pick one at random. 
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", + "enum": ["random", "common"] + }, + "run_bcftools_stats": { + "type": "boolean", + "default": true, + "description": "Turn on bcftools stats generation for VCF based variant calling statistics", + "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", + "fa_icon": "far fa-chart-bar" + } + }, + "fa_icon": "fas fa-sliders-h", + "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." + }, + "consensus_sequence_generation": { + "title": "Consensus Sequence Generation", + "type": "object", + "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", + "default": "", + "properties": { + "run_vcf2genome": { + "type": "boolean", + "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP` flags. Typically useful for small genomes such as mitochondria.\n" + }, + "vcf2genome_outfile": { + "type": "string", + "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.vcf` in the file name.", + "fa_icon": "fas fa-file-alt", + "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" + }, + "vcf2genome_header": { + "type": "string", + "description": "Specify the header name of the consensus sequence entry within the FASTA file.", + "fa_icon": "fas fa-heading", + "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" + }, + "vcf2genome_minc": { + "type": "integer", + "default": 5, + "description": "Minimum depth coverage required for a call to be included (else N will be called).", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Minimum depth coverage for a SNP to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" + }, + "vcf2genome_minq": { + "type": "integer", + "default": 30, + "description": "Minimum genotyping quality of a call to be called. Else N will be called.", + "fa_icon": "fas fa-medal", + "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" + }, + "vcf2genome_minfreq": { + "type": "number", + "default": 0.8, + "description": "Minimum fraction of reads supporting a call to be included. 
Else N will be called.", + "fa_icon": "fas fa-percent", + "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" } + }, + "fa_icon": "fas fa-handshake", + "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" + "snp_table_generation": { + "title": "SNP Table Generation", + "type": "object", + "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", + "default": "", + "properties": { + "run_multivcfanalyzer": { + "type": "boolean", + "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on MultiVCFAnalyzer. Will only work when in combination with UnifiedGenotyper genotyping module.\n" }, - { - "$ref": "#/definitions/input_data_additional_options" + "write_allele_frequencies": { + "type": "boolean", + "description": "Turn on writing write allele frequencies in the SNP table.", + "fa_icon": "fas fa-pen", + "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" }, - { - "$ref": "#/definitions/reference_genome_options" + "min_genotype_quality": { + "type": "integer", + "default": 30, + "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", + "fa_icon": "fas fa-medal", + "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" }, - { - "$ref": "#/definitions/output_options" + "min_base_coverage": { + "type": "integer", + "default": 5, + "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" }, - { - "$ref": "#/definitions/generic_options" + "min_allele_freq_hom": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" }, - { - "$ref": "#/definitions/max_job_request_options" + "min_allele_freq_het": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." 
}, - { - "$ref": "#/definitions/institutional_config_options" + "additional_vcf_files": { + "type": "string", + "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. Use wildcard(s) for multiple files.", + "fa_icon": "fas fa-copy", + "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for [GATK UnifiedGenotyping](#genotyping-parameters) module above." }, - { - "$ref": "#/definitions/skip_steps" + "reference_gff_annotations": { + "type": "string", + "default": "NA", + "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", + "fa_icon": "fas fa-file-signature", + "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" }, - { - "$ref": "#/definitions/complexity_filtering" + "reference_gff_exclude": { + "type": "string", + "default": "NA", + "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", + "fa_icon": "fas fa-times", + "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" }, - { - "$ref": "#/definitions/read_merging_and_adapter_removal" + "snp_eff_results": { + "type": "string", + "default": "NA", + "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", + "fa_icon": "fas fa-magic", + "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." + } + }, + "fa_icon": "fas fa-table", + "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. 
The current version\nof MultiVCFAnalyzer version only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." + }, + "mitochondrial_to_nuclear_ratio": { + "title": "Mitochondrial to Nuclear Ratio", + "type": "object", + "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", + "default": "", + "properties": { + "run_mtnucratio": { + "type": "boolean", + "description": "Turn on mitochondrial to nuclear ratio calculation.", + "fa_icon": "fas fa-balance-scale-left", + "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" }, - { - "$ref": "#/definitions/mapping" + "mtnucratio_header": { + "type": "string", + "default": "MT", + "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", + "fa_icon": "fas fa-heading", + "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on hs37d5/GrCH37 human reference genome. 
Default: 'MT'" + } + }, + "fa_icon": "fas fa-balance-scale-left", + "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" + }, + "human_sex_determination": { + "title": "Human Sex Determination", + "type": "object", + "description": "Options for the calculation of biological sex of human individuals.", + "default": "", + "properties": { + "run_sexdeterrmine": { + "type": "boolean", + "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", + "fa_icon": "fas fa-transgender-alt", + "help_text": "Specify to run the optional process of sex determination.\n" + }, + "sexdeterrmine_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", + "fa_icon": "fas fa-bed", + "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." + } + }, + "fa_icon": "fas fa-transgender", + "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
+ }, + "nuclear_contamination_for_human_dna": { + "title": "Nuclear Contamination for Human DNA", + "type": "object", + "description": "Options for the estimation of contamination of human DNA.", + "default": "", + "properties": { + "run_nuclear_contamination": { + "type": "boolean", + "description": "Turn on nuclear contamination estimation for human reference genomes.", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" }, - { - "$ref": "#/definitions/host_removal" + "contamination_chrom_name": { + "type": "string", + "default": "X", + "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", + "fa_icon": "fas fa-address-card", + "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." + } + }, + "fa_icon": "fas fa-radiation-alt" + }, + "metagenomic_screening": { + "title": "Metagenomic Screening", + "type": "object", + "description": "Options for metagenomic screening of off-target reads.", + "default": "", + "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" }, - { - "$ref": "#/definitions/bam_filtering" + "metagenomic_complexity_entropy": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify a minimum entropy threshold that under which it will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies`bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" }, - { - "$ref": "#/definitions/deduplication" + "run_metagenomic_screening": { + "type": "boolean", + "description": "Turn on metagenomic screening module for reference-unmapped reads.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on the metagenomic screening module.\n" }, - { - "$ref": "#/definitions/library_complexity_analysis" + "metagenomic_tool": { + "type": "string", + "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", + "fa_icon": "fas fa-tools", + "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. 
You should clean the\nwork directory with the command to ensure non-redundancy and large HDD\nfootprints!" }, - { - "$ref": "#/definitions/adna_damage_analysis" + "database": { + "type": "string", + "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", + "fa_icon": "fas fa-database", + "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." }, - { - "$ref": "#/definitions/feature_annotation_statistics" + "metagenomic_min_support_reads": { + "type": "integer", + "default": 1, + "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" }, - { - "$ref": "#/definitions/bam_trimming" + "percent_identity": { + "type": "integer", + "default": 85, + "description": "Percent identity value threshold for MALT.", + "fa_icon": "fas fa-id-card", + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" }, - { - "$ref": "#/definitions/genotyping" + "malt_mode": { + "type": "string", + "default": "BlastN", + "description": "Specify which alignment mode to use for MALT. 
Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", + "fa_icon": "fas fa-align-left", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", + "enum": ["BlastN", "BlastP", "BlastX"] }, - { - "$ref": "#/definitions/consensus_sequence_generation" + "malt_alignment_mode": { + "type": "string", + "default": "SemiGlobal", + "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", + "enum": ["Local", "SemiGlobal"] }, - { - "$ref": "#/definitions/snp_table_generation" + "malt_top_percent": { + "type": "integer", + "default": 1, + "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", + "fa_icon": "fas fa-percent", + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". 
Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" }, - { - "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" + "malt_min_support_mode": { + "type": "string", + "default": "percent", + "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", + "fa_icon": "fas fa-drumstick-bite", + "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", + "enum": ["percent", "reads"] }, - { - "$ref": "#/definitions/human_sex_determination" + "malt_min_support_percent": { + "type": "number", + "default": 0.01, + "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", + "fa_icon": "fas fa-percentage", + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" }, - { - "$ref": "#/definitions/nuclear_contamination_for_human_dna" + "malt_max_queries": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of queries a read can have for MALT.", + "fa_icon": "fas fa-phone", + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. 
Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" }, - { - "$ref": "#/definitions/metagenomic_screening" + "malt_memory_mode": { + "type": "string", + "default": "load", + "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", + "fa_icon": "fas fa-memory", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", + "enum": ["load", "page", "map"] }, - { - "$ref": "#/definitions/metagenomic_authentication" + "malt_sam_output": { + "type": "boolean", + "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", + "fa_icon": "fas fa-file-alt", + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. 
\n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" } - ] -} \ No newline at end of file + }, + "fa_icon": "fas fa-search", + "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set a absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." 
+ }, + "metagenomic_authentication": { + "title": "Metagenomic Authentication", + "type": "object", + "description": "Options for authentication of metagenomic screening performed by MALT.", + "default": "", + "properties": { + "run_maltextract": { + "type": "boolean", + "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" + }, + "maltextract_taxon_list": { + "type": "string", + "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", + "fa_icon": "fas fa-list-ul", + "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_ncbifiles": { + "type": "string", + "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", + "fa_icon": "fas fa-database", + "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_filter": { + "type": "string", + "default": "def_anc", + "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", + "fa_icon": "fas fa-filter", + "help_text": "Specify which MaltExtract filter to use. 
This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", + "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] + }, + "maltextract_toppercent": { + "type": "number", + "default": 0.01, + "description": "Specify percent of top alignments to use.", + "fa_icon": "fas fa-percent", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" + }, + "maltextract_destackingoff": { + "type": "boolean", + "description": "Turn off destacking.", + "fa_icon": "fas fa-align-center", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" + }, + "maltextract_downsamplingoff": { + "type": "boolean", + "description": "Turn off downsampling.", + "fa_icon": "fab fa-creative-commons-sampling", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. 
This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" + }, + "maltextract_duplicateremovaloff": { + "type": "boolean", + "description": "Turn off duplicate removal.", + "fa_icon": "fas fa-align-left", + "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" + }, + "maltextract_matches": { + "type": "boolean", + "description": "Turn on exporting alignments of hits in BLAST format.", + "fa_icon": "fas fa-equals", + "help_text": "\nExport alignments of hits for each node in BLAST format. By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" + }, + "maltextract_megansummary": { + "type": "boolean", + "description": "Turn on export of MEGAN summary files.", + "fa_icon": "fas fa-download", + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" + }, + "maltextract_percentidentity": { + "type": "number", + "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", + "default": 85, + "fa_icon": "fas fa-id-card", + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. 
Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" + }, + "maltextract_topalignment": { + "type": "boolean", + "description": "Turn on using top alignments per read after filtering.", + "fa_icon": "fas fa-star-half-alt", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" + } + }, + "fa_icon": "fas fa-tasks", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/input_data_additional_options" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/output_options" + }, + { + "$ref": "#/definitions/generic_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/skip_steps" + }, + { + "$ref": "#/definitions/complexity_filtering" + }, + { + "$ref": "#/definitions/read_merging_and_adapter_removal" + }, + { + "$ref": "#/definitions/mapping" + }, + { + "$ref": "#/definitions/host_removal" + }, + { + "$ref": "#/definitions/bam_filtering" + }, + { + "$ref": "#/definitions/deduplication" + }, + { + "$ref": "#/definitions/library_complexity_analysis" + }, + { + "$ref": "#/definitions/adna_damage_analysis" + }, + { + "$ref": "#/definitions/feature_annotation_statistics" + }, 
+ { + "$ref": "#/definitions/bam_trimming" + }, + { + "$ref": "#/definitions/genotyping" + }, + { + "$ref": "#/definitions/consensus_sequence_generation" + }, + { + "$ref": "#/definitions/snp_table_generation" + }, + { + "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" + }, + { + "$ref": "#/definitions/human_sex_determination" + }, + { + "$ref": "#/definitions/nuclear_contamination_for_human_dna" + }, + { + "$ref": "#/definitions/metagenomic_screening" + }, + { + "$ref": "#/definitions/metagenomic_authentication" + } + ] +} From f6ebee3cc68522411b39ed127e7212586b14eb1e Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:18:25 +0200 Subject: [PATCH 05/15] Fix markdown linting --- docs/usage.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 683dacfba..c73f50292 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -530,7 +530,7 @@ and investigate the log and error messages that are produced by each command of the process. For example, in the error in -[1a](#1a-Nextflow-reports-an-error-executing-process-with-command-error) you can +[1a](#1a-nextflow-reports-an-error-executing-process-with-command-error) you can see the following line ```bash @@ -1443,7 +1443,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Human Pop-Gen - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -1699,7 +1699,7 @@ each `Lane`, but the `Sample_Name` and `Library_ID` columns identify and group them together accordingly. 
Secondly, as we have NextSeq data, we have specified we have `2` for `Colour_Chemistry`, which is important for downstream processing (see below). The other columns are less important for this particular context of -metagenomic screening. See the nf-core/eager [usage](#pipeline-options) +metagenomic screening. See the nf-core/eager [the parameters documentation](https://nf-co.re/eager/parameters) documentation for more specifications on how to set up a TSV file (e.g. why despite NextSeqs only having 4 lanes, we go up to 8 in the example above). @@ -1802,7 +1802,7 @@ nextflow run nf-core/eager \ nf-core/eager will now take all unmapped reads after mapping and convert the BAM file back to FASTQ, which can be accepted by MALT. But of course, we also then need to tell nf-core/eager we actually want to run MALT. We will also specify -the location of the [pre-built database](#preparation) and which 'min support' +the location of the [pre-built database](#tutorial-metagenomics---preparation) and which 'min support' method we want to use (this specifies the minimum number of alignments is needed to a particular taxonomic node to be 'kept' in the MALT output files). Otherwise we will keep all other parameters as default. For example using BlastN mode, @@ -1874,7 +1874,7 @@ Porphyromonas ``` We have also specified the path to the HOPS resources [downloaded -earlier](#preparation), and that I want to turn off 'destacking' (removal of any +earlier](#tutorial-metagenomics---preparation), and that I want to turn off 'destacking' (removal of any read that overlaps the positions of another - something only recommended to keep on when you have high coverage data). @@ -1885,7 +1885,7 @@ signal drop or want to log off, Nextflow will not crash. 
#### Tutorial Metagenomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [usage](#pipeline-options) that all parameters are as expected, or check +against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -2511,7 +2511,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Pathogen Genomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. From f5f805cb8efe5d565cd4aecdc5b5b9066c2d3c86 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:23:15 +0200 Subject: [PATCH 06/15] Revert nf-core non-standard linting files but still template --- .github/PULL_REQUEST_TEMPLATE.md | 24 ++-- .github/markdownlint.yml | 19 +-- .github/workflows/branch.yml | 9 +- .github/workflows/linting_comment.yml | 1 + assets/email_template.html | 169 +++++++------------------- 5 files changed, 65 insertions(+), 157 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6f09b12b6..ecd0403f7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,27 +1,17 @@ - ## PR checklist -- [ ] This comment contains a description of changes (with reason). -- [ ] If you've fixed a bug or added code that should be tested, add tests! 
- - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. -- [ ] Make sure your code lints (`nf-core lint .`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). -- [ ] Usage Documentation in `docs/usage.md` is updated. -- [ ] Output Documentation in `docs/output.md` is updated. -- [ ] `CHANGELOG.md` is updated. -- [ ] `README.md` is updated (including new tool citations and authors/contributors). + - [ ] This comment contains a description of changes (with reason) + - [ ] `CHANGELOG.md` is updated + - [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] Documentation in `docs` is updated \ No newline at end of file diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index 24989492d..2e6435878 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -2,11 +2,16 @@ default: true line-length: false no-duplicate-header: - siblings_only: true + siblings_only: true no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary + allowed_elements: + - img + - p + - kbd + - details + - summary + - kbd +# tools only - the {{ jinja variables }} break URLs and cause this to error +no-bare-urls: false +# tools only - suppresses error messages for usage of $ in main README +commands-show-output: false \ No newline at end of file diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 5b39a40ed..30cf891cc 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -14,7 +14,6 @@ jobs: if: github.repository == 'nf-core/eager' run: | { [[ 
${{github.event.pull_request.head.repo.full_name }} == nf-core/eager ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] - # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment @@ -23,22 +22,16 @@ jobs: with: message: | ## This PR is against the `master` branch :x: - * Do not close this PR * Click _Edit_ and change the `base` to `dev` * This CI test will remain failed until you push a new commit - --- - Hi @${{ github.event.pull_request.user.login }}, - It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. The `master` branch on nf-core repositories should always contain code from the latest release. Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. - You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit. - Thanks again for your contribution! 
repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false + allow-repeats: false \ No newline at end of file diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 0471addcc..d616b6d51 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,3 +1,4 @@ + name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork diff --git a/assets/email_template.html b/assets/email_template.html index 1e4a996f1..68825bb06 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,134 +1,53 @@ - - - - + + + + - - Codestin Search App - - -

- + + Codestin Search App + + +
-

nf-core/eager v${version}

-

Run Name: $runName

+ - <% if (!success){ out << """ -
-

- nf-core/eager execution completed unsuccessfully! -

-

- The exit status of the task that caused the workflow execution to fail - was: $exitStatus. -

+

nf-core/eager v${version}

+

Run Name: $runName

+ +<% if (!success){ + out << """ +
+

nf-core/eager execution completed unsuccessfully!

+

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

-
-${errorReport}
-
- """ } else { out << """ -
+
${errorReport}
+
+ """ +} else { + out << """ +
nf-core/eager execution completed successfully! -
- """ } %> +
+ """ +} +%> -

- The workflow was completed at $dateComplete (duration: - $duration) -

-

The command used to launch the workflow was as follows:

-
-$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> " - - - - - " }.join("\n") %> - -
- $k - -
$v
-
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
-

nf-core/eager

-

- https://github.com/nf-core/eager -

-
- - +

nf-core/eager

+

https://github.com/nf-core/eager

+ +
+ + + \ No newline at end of file From cedb95001820c755563e797a768791e27aa3baa5 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:25:28 +0200 Subject: [PATCH 07/15] Revert "Revert nf-core non-standard linting files but still template" This reverts commit f5f805cb8efe5d565cd4aecdc5b5b9066c2d3c86. --- .github/PULL_REQUEST_TEMPLATE.md | 24 ++-- .github/markdownlint.yml | 19 ++- .github/workflows/branch.yml | 9 +- .github/workflows/linting_comment.yml | 1 - assets/email_template.html | 169 +++++++++++++++++++------- 5 files changed, 157 insertions(+), 65 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ecd0403f7..6f09b12b6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,17 +1,27 @@ + ## PR checklist - - [ ] This comment contains a description of changes (with reason) - - [ ] `CHANGELOG.md` is updated - - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] Documentation in `docs` is updated \ No newline at end of file +- [ ] This comment contains a description of changes (with reason). +- [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] Make sure your code lints (`nf-core lint .`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Usage Documentation in `docs/usage.md` is updated. +- [ ] Output Documentation in `docs/output.md` is updated. +- [ ] `CHANGELOG.md` is updated. 
+- [ ] `README.md` is updated (including new tool citations and authors/contributors). diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index 2e6435878..24989492d 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -2,16 +2,11 @@ default: true line-length: false no-duplicate-header: - siblings_only: true + siblings_only: true no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary - - kbd -# tools only - the {{ jinja variables }} break URLs and cause this to error -no-bare-urls: false -# tools only - suppresses error messages for usage of $ in main README -commands-show-output: false \ No newline at end of file + allowed_elements: + - img + - p + - kbd + - details + - summary diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 30cf891cc..5b39a40ed 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -14,6 +14,7 @@ jobs: if: github.repository == 'nf-core/eager' run: | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/eager ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment @@ -22,16 +23,22 @@ jobs: with: message: | ## This PR is against the `master` branch :x: + * Do not close this PR * Click _Edit_ and change the `base` to `dev` * This CI test will remain failed until you push a new commit + --- + Hi @${{ github.event.pull_request.user.login }}, + It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. The `master` branch on nf-core repositories should always contain code from the latest release. 
Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. + You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. Note that even after this, the test will continue to show as failing until you push a new commit. + Thanks again for your contribution! repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false \ No newline at end of file + allow-repeats: false diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index d616b6d51..0471addcc 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,4 +1,3 @@ - name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork diff --git a/assets/email_template.html b/assets/email_template.html index 68825bb06..1e4a996f1 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,53 +1,134 @@ - - - - + + + + - - Codestin Search App - - -
+ + Codestin Search App + + +
+ - +

nf-core/eager v${version}

+

Run Name: $runName

-

nf-core/eager v${version}

-

Run Name: $runName

- -<% if (!success){ - out << """ -
-

nf-core/eager execution completed unsuccessfully!

-

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

+ <% if (!success){ out << """ +
+

+ nf-core/eager execution completed unsuccessfully! +

+

+ The exit status of the task that caused the workflow execution to fail + was: $exitStatus. +

The full error message was:

-
${errorReport}
-
- """ -} else { - out << """ -
+
+${errorReport}
+
+ """ } else { out << """ +
nf-core/eager execution completed successfully! -
- """ -} -%> +
+ """ } %> -

The workflow was completed at $dateComplete (duration: $duration)

-

The command used to launch the workflow was as follows:

-
$commandLine
+

+ The workflow was completed at $dateComplete (duration: + $duration) +

+

The command used to launch the workflow was as follows:

+
+$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> "" }.join("\n") %> - -
$k
$v
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> " + + + + + " }.join("\n") %> + +
+ $k + +
$v
+
-

nf-core/eager

-

https://github.com/nf-core/eager

- -
- - - \ No newline at end of file +

nf-core/eager

+

+ https://github.com/nf-core/eager +

+
+ + From 0a0b4b372340e584c0121b1322a7b4b0317850e0 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:25:34 +0200 Subject: [PATCH 08/15] Revert "Fix markdown linting" This reverts commit f6ebee3cc68522411b39ed127e7212586b14eb1e. --- docs/usage.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index c73f50292..683dacfba 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -530,7 +530,7 @@ and investigate the log and error messages that are produced by each command of the process. For example, in the error in -[1a](#1a-nextflow-reports-an-error-executing-process-with-command-error) you can +[1a](#1a-Nextflow-reports-an-error-executing-process-with-command-error) you can see the following line ```bash @@ -1443,7 +1443,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Human Pop-Gen - Results Assuming the run completed without any crashes (if problems do occur, check -against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or +against [#usage](#pipeline-options) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -1699,7 +1699,7 @@ each `Lane`, but the `Sample_Name` and `Library_ID` columns identify and group them together accordingly. Secondly, as we have NextSeq data, we have specified we have `2` for `Colour_Chemistry`, which is important for downstream processing (see below). The other columns are less important for this particular context of -metagenomic screening. See the nf-core/eager [the parameters documentation](https://nf-co.re/eager/parameters) +metagenomic screening. See the nf-core/eager [usage](#pipeline-options) documentation for more specifications on how to set up a TSV file (e.g. why despite NextSeqs only having 4 lanes, we go up to 8 in the example above). 
@@ -1802,7 +1802,7 @@ nextflow run nf-core/eager \ nf-core/eager will now take all unmapped reads after mapping and convert the BAM file back to FASTQ, which can be accepted by MALT. But of course, we also then need to tell nf-core/eager we actually want to run MALT. We will also specify -the location of the [pre-built database](#tutorial-metagenomics---preparation) and which 'min support' +the location of the [pre-built database](#preparation) and which 'min support' method we want to use (this specifies the minimum number of alignments is needed to a particular taxonomic node to be 'kept' in the MALT output files). Otherwise we will keep all other parameters as default. For example using BlastN mode, @@ -1874,7 +1874,7 @@ Porphyromonas ``` We have also specified the path to the HOPS resources [downloaded -earlier](#tutorial-metagenomics---preparation), and that I want to turn off 'destacking' (removal of any +earlier](#preparation), and that I want to turn off 'destacking' (removal of any read that overlaps the positions of another - something only recommended to keep on when you have high coverage data). @@ -1885,7 +1885,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Metagenomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or check +against [usage](#pipeline-options) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -2511,7 +2511,7 @@ signal drop or want to log off, Nextflow will not crash. 
#### Tutorial Pathogen Genomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [the parameters documentation](https://nf-co.re/eager/parameters) that all parameters are as expected, or +against [#usage](#pipeline-options) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. From 687eb7fdff7a18b34992db4ee88d1fb85f6b557b Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:25:57 +0200 Subject: [PATCH 09/15] Revert "prettier fixes" This reverts commit 99046cb0faa2d0bd9c5affe8190e98b483d245ba. --- .github/CONTRIBUTING.md | 60 +- .github/ISSUE_TEMPLATE/bug_report.md | 3 +- .github/PULL_REQUEST_TEMPLATE.md | 6 +- .../pull_request_template.md | 20 +- .github/markdownlint.yml | 14 +- .github/workflows/awsfulltest.yml | 2 + .github/workflows/awstest.yml | 2 + .github/workflows/branch.yml | 2 + .github/workflows/ci.yml | 16 +- .github/workflows/linting.yml | 8 +- .github/workflows/linting_comment.yml | 2 + README.md | 221 +- assets/angsd_resources/README | 15 +- assets/email_template.html | 167 +- assets/multiqc_config.yaml | 592 +-- docs/README.md | 27 +- docs/output.md | 296 +- docs/usage.md | 338 +- nextflow_schema.json | 3416 +++++++++-------- 19 files changed, 2620 insertions(+), 2587 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index fc6028ac7..75b61b9ff 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -16,7 +16,7 @@ Contributions to the code are even more welcome ;) If you'd like to write some code for nf-core/eager, the standard workflow is as follows: 1. 
Check that there isn't already an issue about your idea in the [nf-core/eager issues](https://github.com/nf-core/eager/issues) to avoid duplicating work - - If there isn't one already, please create one so that others know you're working on this + * If there isn't one already, please create one so that others know you're working on this 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/eager repository](https://github.com/nf-core/eager) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build .` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). @@ -49,9 +49,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -- On your own fork, make a new branch `patch` based on `upstream/master`. -- Fix the bug, and bump version (X.Y.Z+1). -- A PR should be made on `master` from patch to directly this particular bug. +* On your own fork, make a new branch `patch` based on `upstream/master`. +* Fix the bug, and bump version (X.Y.Z+1). +* A PR should be made on `master` from patch to directly this particular bug. ## Getting help @@ -96,9 +96,9 @@ The process resources can be passed on to the tool dynamically within the proces Please use the following naming schemes, to make it easy to understand what is going where. 
-- initial process channel: `ch_output_from_` -- intermediate and terminal channels: `ch__for_` -- skipped process output: `ch__for_`(this goes out of the bypass statement described above) +* initial process channel: `ch_output_from_` +* intermediate and terminal channels: `ch__for_` +* skipped process output: `ch__for_`(this goes out of the bypass statement described above) ### Nextflow version bumping @@ -135,18 +135,18 @@ For all internal nf-core/eager documentation images we are using the 'Kalam' fon We are providing a highly configurable pipeline, with many options to turn on and off different processes in different combinations. This can make a very complex graph structure that can cause a large amount of duplicated channels coming out of every process to account for each possible combination. -The EAGER pipeline can currently be broken down into the following 'stages', where a stage is a collection of non-terminal mutually exclusive processes, which is the output of which is used for another file reporting module (but not reporting!) . +The EAGER pipeline can currently be broken down into the following 'stages', where a stage is a collection of non-terminal mutually exclusive processes, which is the output of which is used for another file reporting module (but not reporting!) . -- Input -- Convert BAM -- PolyG Clipping -- AdapterRemoval -- Mapping (either `bwa`, `bwamem`, or `circularmapper`) -- BAM Filtering -- Deduplication (either `dedup` or `markduplicates`) -- BAM Trimming -- PMDtools -- Genotyping +* Input +* Convert BAM +* PolyG Clipping +* AdapterRemoval +* Mapping (either `bwa`, `bwamem`, or `circularmapper`) +* BAM Filtering +* Deduplication (either `dedup` or `markduplicates`) +* BAM Trimming +* PMDtools +* Genotyping Every step can potentially be skipped, therefore the output of a previous stage must be able to be passed to the next stage, if the given stage is not run. 
@@ -154,16 +154,16 @@ To somewhat simplify this logic, we have implemented the following structure. The concept is as follows: -- Every 'stage' of the pipeline (i.e. collection of mutually exclusive processes) must always have a if else statement following it. -- This if else 'bypass' statement collects and standardises all possible input files into single channel(s) for the next stage. -- Importantly - within the bypass statement, a channel from the previous stage's bypass mixes into these output channels. This additional channel is named `ch_previousstage_for_skipcurrentstage`. This contains the output from the previous stage, i.e. not the modified version from the current stage. -- The bypass statement works as follows: - - If the current stage is turned on: will mix the previous stage and current stage output and filter for file suffixes unique to the current stage output - - If the current stage is turned off or skipped: will mix the previous stage and current stage output. However as there there is no files in the output channel from the current stage, no filtering is required and the files in the 'ch_XXX_for_skipXXX' stage will be used. - -This ensures the same channel inputs to the next stage is 'homogeneous' - i.e. all comes from the same source (the bypass statement) - -An example schematic can be given as follows +* Every 'stage' of the pipeline (i.e. collection of mutually exclusive processes) must always have a if else statement following it. +* This if else 'bypass' statement collects and standardises all possible input files into single channel(s) for the next stage. +* Importantly - within the bypass statement, a channel from the previous stage's bypass mixes into these output channels. This additional channel is named `ch_previousstage_for_skipcurrentstage`. This contains the output from the previous stage, i.e. not the modified version from the current stage. 
+* The bypass statement works as follows: + * If the current stage is turned on: will mix the previous stage and current stage output and filter for file suffixes unique to the current stage output + * If the current stage is turned off or skipped: will mix the previous stage and current stage output. However as there there is no files in the output channel from the current stage, no filtering is required and the files in the 'ch_XXX_for_skipXXX' stage will be used. + + This ensures the same channel inputs to the next stage is 'homogeneous' - i.e. all comes from the same source (the bypass statement) + + An example schematic can be given as follows ```nextflow // PREVIOUS STAGE OUTPUT @@ -191,7 +191,7 @@ process fastp { script: """ - echo "I have been fastp'd" > ${fq} + echo "I have been fastp'd" > ${fq} mv ${fq} ${fq}.pG.fq """ } @@ -206,4 +206,4 @@ if (params.run_fastp) { .into { ch_fastp_for_adapterremoval; ch_fastp_for_skipadapterremoval } } -``` + ``` diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 596e363ee..b461caca3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -18,7 +18,8 @@ Please delete this text and anything that's not relevant from the template below I have checked the following places for your error: - [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) -- [ ] [nf-core/eager pipeline documentation](https://nf-co.re/nf-core/eager/usage) - nf-core/eager FAQ/troubleshooting can be found [here](https://nf-co.re/eager/usage#troubleshooting-and-faqs) +- [ ] [nf-core/eager pipeline documentation](https://nf-co.re/nf-core/eager/usage) + - nf-core/eager FAQ/troubleshooting can be found [here](https://nf-co.re/eager/usage#troubleshooting-and-faqs) ## Description of the bug diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6f09b12b6..864af6938 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ 
b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,9 +16,9 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/eage - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](nf-core/eager/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/eager _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint .`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). - [ ] Usage Documentation in `docs/usage.md` is updated. diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md index 80e155437..959f01ca4 100644 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -4,15 +4,15 @@ Please fill in the appropriate checklist below (delete whatever is not relevant) ## PR checklist -- [ ] This comment contains a description of changes (with reason). -- [ ] If you've fixed a bug or added code that should be tested, add tests! 
- - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` - - [ ] If necessary, also make a PR on the [nf-core/eager branch on the nf-core/test-datasets repo](https://github.com/nf-core/test-datasets/pull/new/nf-core/eager). -- [ ] Make sure your code lints (`nf-core lint .`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). -- [ ] Usage Documentation in `docs/usage.md` is updated. -- [ ] Output Documentation in `docs/output.md` is updated. -- [ ] `CHANGELOG.md` is updated. -- [ ] `README.md` is updated (including new tool citations and authors/contributors). + - [ ] This comment contains a description of changes (with reason). + - [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If necessary, also make a PR on the [nf-core/eager branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/eager). + - [ ] Make sure your code lints (`nf-core lint .`). + - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). + - [ ] Usage Documentation in `docs/usage.md` is updated. + - [ ] Output Documentation in `docs/output.md` is updated. + - [ ] `CHANGELOG.md` is updated. + - [ ] `README.md` is updated (including new tool citations and authors/contributors). 
**Learn more about contributing:** https://github.com/nf-core/eager/tree/master/.github/CONTRIBUTING.md diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml index 24989492d..8d7eb53b0 100644 --- a/.github/markdownlint.yml +++ b/.github/markdownlint.yml @@ -2,11 +2,11 @@ default: true line-length: false no-duplicate-header: - siblings_only: true + siblings_only: true no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary + allowed_elements: + - img + - p + - kbd + - details + - summary diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index c12dc70a6..4e03e75be 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -9,6 +9,7 @@ on: types: [completed] workflow_dispatch: + env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -17,6 +18,7 @@ env: AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + jobs: run-awstest: name: Run AWS full tests diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index e889eed9e..6e0a9538c 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -6,6 +6,7 @@ name: nf-core AWS test on: workflow_dispatch: + env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -14,6 +15,7 @@ env: AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} + jobs: run-awstest: name: Run AWS tests diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 5b39a40ed..909b52d6b 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -15,6 +15,7 @@ jobs: run: | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/eager ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + # If the above check failed, post a comment on the PR explaining 
the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment @@ -42,3 +43,4 @@ jobs: Thanks again for your contribution! repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2ab62933..8977cd31e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ["20.07.1", ""] + nxf_ver: ['20.07.1', ''] steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -58,7 +58,7 @@ jobs: run: | git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data - name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs - run: | + run: | if [[ $NXF_VER = '' ]]; then sleep 1200; fi - name: BASIC Run the basic pipeline with directly supplied single-end FASTQ run: | @@ -74,7 +74,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --save_reference - name: REFERENCE Basic workflow, with supplied indices run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --bwa_index 'results/reference_genome/bwa_index/BWAIndex/' --fasta_index 'https://github.com/nf-core/test-datasets/blob/eager/reference/Mammoth/Mammoth_MT_Krause.fasta.fai' - name: REFERENCE Run the basic pipeline with FastA reference with `fna` extension run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker @@ -107,7 +107,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 
'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' - name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_post_ar_trimming @@ -193,11 +193,11 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | - mkdir -p databases/maltextract - for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done + mkdir -p databases/maltextract + for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done - name: MALTEXTRACT Basic with MALT plus MaltExtract run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_bam_filtering --bam_unmapped_type 'fastq' 
--run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq' @@ -216,4 +216,4 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio - name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' \ No newline at end of file diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 771dfd721..77b4b9d07 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -45,6 +45,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false + YAML: runs-on: ubuntu-latest steps: @@ -81,9 +82,11 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false + nf-core: runs-on: ubuntu-latest steps: + - name: Check out pipeline code uses: actions/checkout@v2 @@ -96,8 +99,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: "3.6" - architecture: "x64" + python-version: '3.6' + architecture: 'x64' - name: Install dependencies run: | @@ -124,3 +127,4 @@ jobs: lint_log.txt lint_results.md PR_number.txt + diff --git 
a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 0471addcc..90f03c6f9 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,3 +1,4 @@ + name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork @@ -25,3 +26,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md + diff --git a/README.md b/README.md index 4103766da..1b4ff5b36 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ ## Introduction - **nf-core/eager** is a scalable and reproducible bioinformatics best-practise processing pipeline for genomic NGS sequencing data, with a focus on ancient DNA (aDNA) data. It is ideal for the (palaeo)genomic analysis of humans, animals, plants, microbes and even microbiomes. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. The pipeline pre-processes raw data from FASTQ inputs, or preprocessed BAM inputs. It can align reads and performs extensive general NGS and aDNA specific quality-control on the results. It comes with docker, singularity or conda containers making installation trivial and results highly reproducible. @@ -35,23 +34,23 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 3. Download the pipeline and test it on a minimal dataset with a single command: - ```bash - nextflow run nf-core/eager -profile test, - ``` + ```bash + nextflow run nf-core/eager -profile test, + ``` - > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. 
If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. 4. Start running your own analysis! - ```bash - nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' - ``` + ```bash + nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' + ``` 5. Once your run has completed successfully, clean up the intermediate files. - ```bash - nextflow clean -f -k - ``` + ```bash + nextflow clean -f -k + ``` See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. 
@@ -65,16 +64,16 @@ Modifications to the default pipeline are easily made using various options as d By default the pipeline currently performs the following: -- Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) -- Sequencing quality control (`FastQC`) -- Sequencing adapter removal, paired-end data merging (`AdapterRemoval`) -- Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) -- Post-mapping processing, statistics and conversion to bam (`samtools`) -- Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) -- PCR duplicate removal (`DeDup` or `MarkDuplicates`) -- Post-mapping statistics and BAM quality control (`Qualimap`) -- Library Complexity Estimation (`preseq`) -- Overall pipeline statistics summaries (`MultiQC`) +* Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) +* Sequencing quality control (`FastQC`) +* Sequencing adapter removal, paired-end data merging (`AdapterRemoval`) +* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) +* Post-mapping processing, statistics and conversion to bam (`samtools`) +* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) +* PCR duplicate removal (`DeDup` or `MarkDuplicates`) +* Post-mapping statistics and BAM quality control (`Qualimap`) +* Library Complexity Estimation (`preseq`) +* Overall pipeline statistics summaries (`MultiQC`) ### Additional Steps @@ -82,40 +81,40 @@ Additional functionality contained by the pipeline currently includes: #### Input -- Automatic merging of complex sequencing setups (e.g. multiple lanes, sequencing configurations, library types) +* Automatic merging of complex sequencing setups (e.g. 
multiple lanes, sequencing configurations, library types) #### Preprocessing -- Illumina two-coloured sequencer poly-G tail removal (`fastp`) -- Post-AdapterRemoval trimming of FASTQ files prior mapping (`fastp`) -- Automatic conversion of unmapped reads to FASTQ (`samtools`) -- Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples) +* Illumina two-coloured sequencer poly-G tail removal (`fastp`) +* Post-AdapterRemoval trimming of FASTQ files prior mapping (`fastp`) +* Automatic conversion of unmapped reads to FASTQ (`samtools`) +* Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples) #### aDNA Damage manipulation -- Damage removal/clipping for UDG+/UDG-half treatment protocols (`BamUtil`) -- Damaged reads extraction and assessment (`PMDTools`) -- Nuclear DNA contamination estimation of human samples (`angsd`) +* Damage removal/clipping for UDG+/UDG-half treatment protocols (`BamUtil`) +* Damaged reads extraction and assessment (`PMDTools`) +* Nuclear DNA contamination estimation of human samples (`angsd`) #### Genotyping -- Creation of VCF genotyping files (`GATK UnifiedGenotyper`, `GATK HaplotypeCaller` and `FreeBayes`) -- Creation of EIGENSTRAT genotyping files (`pileupCaller`) -- Creation of Genotype Likelihood files (`angsd`) -- Consensus sequence FASTA creation (`VCF2Genome`) -- SNP Table generation (`MultiVCFAnalyzer`) +* Creation of VCF genotyping files (`GATK UnifiedGenotyper`, `GATK HaplotypeCaller` and `FreeBayes`) +* Creation of EIGENSTRAT genotyping files (`pileupCaller`) +* Creation of Genotype Likelihood files (`angsd`) +* Consensus sequence FASTA creation (`VCF2Genome`) +* SNP Table generation (`MultiVCFAnalyzer`) #### Biological Information -- Mitochondrial to Nuclear read ratio calculation (`MtNucRatioCalculator`) -- Statistical sex determination of human individuals (`Sex.DetERRmine`) +* Mitochondrial to Nuclear read ratio calculation (`MtNucRatioCalculator`) +* Statistical sex determination of 
human individuals (`Sex.DetERRmine`) #### Metagenomic Screening -- Low-sequenced complexity filtering (`BBduk`) -- Taxonomic binner with alignment (`MALT`) -- Taxonomic binner without alignment (`Kraken2`) -- aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) +* Low-sequenced complexity filtering (`BBduk`) +* Taxonomic binner with alignment (`MALT`) +* Taxonomic binner without alignment (`Kraken2`) +* aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) #### Functionality Overview @@ -131,11 +130,11 @@ The nf-core/eager pipeline comes with documentation about the pipeline: [usage]( 1. [Nextflow installation](https://nf-co.re/usage/installation) 2. Pipeline configuration - - [Pipeline installation](https://nf-co.re/usage/local_installation) - - [Adding your own system config](https://nf-co.re/usage/adding_own_config) - - [Reference genomes](https://nf-co.re/usage/reference_genomes) + * [Pipeline installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. [Running the pipeline](https://nf-co.re/eager/docs/usage.md) - - This includes tutorials, FAQs, and troubleshooting instructions + * This includes tutorials, FAQs, and troubleshooting instructions 4. [Output and how to interpret the results](https://nf-co.re/eager/docs/output.md) ## Credits @@ -147,43 +146,43 @@ of this pipeline: ## Authors (alphabetical) -- [Aida Andrades Valtueña](https://github.com/aidaanva) -- [Alexander Peltzer](https://github.com/apeltzer) -- [James A. Fellows Yates](https://github.com/jfy133) -- [Judith Neukamm](https://github.com/JudithNeukamm) -- [Maxime Borry](https://github.com/maxibor) -- [Maxime Garcia](https://github.com/MaxUlysse) -- [Stephen Clayton](https://github.com/sc13-bioinf) -- [Thiseas C. 
Lamnidis](https://github.com/TCLamnidis) -- [Zandra Fagernäs](https://github.com/ZandraFagernas) +* [Aida Andrades Valtueña](https://github.com/aidaanva) +* [Alexander Peltzer](https://github.com/apeltzer) +* [James A. Fellows Yates](https://github.com/jfy133) +* [Judith Neukamm](https://github.com/JudithNeukamm) +* [Maxime Borry](https://github.com/maxibor) +* [Maxime Garcia](https://github.com/MaxUlysse) +* [Stephen Clayton](https://github.com/sc13-bioinf) +* [Thiseas C. Lamnidis](https://github.com/TCLamnidis) +* [Zandra Fagernäs](https://github.com/ZandraFagernas) ## Additional Contributors (alphabetical) Those who have provided conceptual guidance, suggestions, bug reports etc. -- [Alexandre Gilardet](https://github.com/alexandregilardet) -- Arielle Munters -- [Åshild Vågene](https://github.com/ashildv) -- [Charles Plessy](https://github.com/charles-plessy) -- [Elina Salmela](https://github.com/esalmela) -- [Hester van Schalkwyk](https://github.com/hesterjvs) -- [Ido Bar](https://github.com/IdoBar) -- [Irina Velsko](https://github.com/ivelsko) -- [Işın Altınkaya](https://github.com/isinaltinkaya) -- [Johan Nylander](https://github.com/nylander) -- [Katerine Eaton](https://github.com/ktmeaton) -- [Kathrin Nägele](https://github.com/KathrinNaegele) -- [Luc Venturini](https://github.com/lucventurini) -- [Marcel Keller](https://github.com/marcel-keller) -- [Megan Michel](https://github.com/meganemichel) -- [Pierre Lindenbaum](https://github.com/lindenb) -- [Pontus Skoglund](https://github.com/pontussk) -- [Raphael Eisenhofer](https://github.com/EisenRa) -- [Roberta Davidson](https://github.com/roberta-davidson) -- [Torsten Günter](https://bitbucket.org/tguenther/) -- [Kevin Lord](https://github.com/lordkev) -- [He Yu](https://github.com/paulayu) -- [Selina Carlhoff](https://github.com/scarlhoff) +* [Alexandre Gilardet](https://github.com/alexandregilardet) +* Arielle Munters +* [Åshild Vågene](https://github.com/ashildv) +* [Charles 
Plessy](https://github.com/charles-plessy) +* [Elina Salmela](https://github.com/esalmela) +* [Hester van Schalkwyk](https://github.com/hesterjvs) +* [Ido Bar](https://github.com/IdoBar) +* [Irina Velsko](https://github.com/ivelsko) +* [Işın Altınkaya](https://github.com/isinaltinkaya) +* [Johan Nylander](https://github.com/nylander) +* [Katerine Eaton](https://github.com/ktmeaton) +* [Kathrin Nägele](https://github.com/KathrinNaegele) +* [Luc Venturini](https://github.com/lucventurini) +* [Marcel Keller](https://github.com/marcel-keller) +* [Megan Michel](https://github.com/meganemichel) +* [Pierre Lindenbaum](https://github.com/lindenb) +* [Pontus Skoglund](https://github.com/pontussk) +* [Raphael Eisenhofer](https://github.com/EisenRa) +* [Roberta Davidson](https://github.com/roberta-davidson) +* [Torsten Günter](https://bitbucket.org/tguenther/) +* [Kevin Lord](https://github.com/lordkev) +* [He Yu](https://github.com/paulayu) +* [Selina Carlhoff](https://github.com/scarlhoff) If you've contributed and you're missing in here, please let us know and we will add you in of course! @@ -211,43 +210,43 @@ You can cite the `nf-core` publication as follows: In addition, references of tools and data used in this pipeline are as follows: -- **EAGER v1**, CircularMapper, DeDup\* Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z). Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) -- **FastQC** Download: [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -- **AdapterRemoval v2** Schubert, M., Lindgreen, S., & Orlando, L. (2016). 
AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. [https://doi.org/10.1186/s13104-016-1900-2](https://doi.org/10.1186/s13104-016-1900-2). Download: [https://github.com/MikkelSchubert/adapterremoval](https://github.com/MikkelSchubert/adapterremoval) -- **bwa** Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. [https://doi.org/10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324). Download: [http://bio-bwa.sourceforge.net/bwa.shtml](http://bio-bwa.sourceforge.net/bwa.shtml) -- **SAMtools** Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352). Download: [http://www.htslib.org/](http://www.htslib.org/) -- **DamageProfiler** Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). [https://doi.org/10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: [https://github.com/Integrative-Transcriptomics/DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler) -- **QualiMap** Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. [https://doi.org/10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566). Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) -- **preseq** Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. 
[https://doi.org/10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375). Download: [http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) -- **PMDTools** Skoglund, P., Northoff, B. H., Shunkov, M. V., Derevianko, A. P., Pääbo, S., Krause, J., & Jakobsson, M. (2014). Separating endogenous ancient DNA from modern day contamination in a Siberian Neandertal. Proceedings of the National Academy of Sciences of the United States of America, 111(6), 2229–2234. [https://doi.org/10.1073/pnas.1318934111](https://doi.org/10.1073/pnas.1318934111). Download: [https://github.com/pontussk/PMDtools](https://github.com/pontussk/PMDtools) -- **MultiQC** Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. [https://doi.org/10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354). Download: [https://multiqc.info/](https://multiqc.info/) -- **BamUtils** Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. [https://doi.org/10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114). Download: [https://genome.sph.umich.edu/wiki/BamUtil](https://genome.sph.umich.edu/wiki/BamUtil) -- **FastP** Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. [https://doi.org/10.1093/bioinformatics/bty560](https://doi.org/10.1093/bioinformatics/bty560). Download: [https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) -- **GATK 3.5** DePristo, M. A., Banks, E., Poplin, R., Garimella, K. V., Maguire, J. R., Hartl, C., … Daly, M. J. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. 
Nature Genetics, 43(5), 491–498. [https://doi.org/10.1038/ng.806](https://doi.org/10.1038/ng.806.).Download: [https://console.cloud.google.com/storage/browser/gatk](https://console.cloud.google.com/storage/browser/gatk) -- **GATK 4.X** - no citation available yet. Download: [https://github.com/broadinstitute/gatk/releases](https://github.com/broadinstitute/gatk/releases) -- **VCF2Genome** - Alexander Herbig and Alex Peltzer (unpublished). Download: [https://github.com/apeltzer/VCF2Genome](https://github.com/apeltzer/VCF2Genome) -- **MultiVCFAnalyzer** Bos, K.I. et al., 2014. Pre-Columbian mycobacterial genomes reveal seals as a source of New World human tuberculosis. Nature, 514(7523), pp.494–497. Available at: [http://dx.doi.org/10.1038/nature13591](http://dx.doi.org/10.1038/nature13591). Download: [https://github.com/alexherbig/MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) -- **MTNucRatioCalculator** Alex Peltzter (Unpublished). Download: [https://github.com/apeltzer/MTNucRatioCalculator](https://github.com/apeltzer/MTNucRatioCalculator) -- **Sex.DetERRmine.py** Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: [http://dx.doi.org/10.1038/s41467-018-07483-5](http://dx.doi.org/10.1038/s41467-018-07483-5). Download: [https://github.com/TCLamnidis/Sex.DetERRmine.git](https://github.com/TCLamnidis/Sex.DetERRmine.git) -- **ANGSD** Korneliussen, T.S., Albrechtsen, A. & Nielsen, R., 2014. ANGSD: Analysis of Next Generation Sequencing Data. BMC bioinformatics, 15, p.356. Available at: [http://dx.doi.org/10.1186/s12859-014-0356-4](http://dx.doi.org/10.1186/s12859-014-0356-4). Download: [https://github.com/ANGSD/angsd](https://github.com/ANGSD/angsd) -- **bedtools** Quinlan, A.R. & Hall, I.M., 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics , 26(6), pp.841–842. 
Available at: [http://dx.doi.org/10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033). Download: [https://github.com/arq5x/bedtools2/releases](https://github.com/arq5x/bedtools2/) -- **MALT**. Download: [https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html) - - Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). - - Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). -- **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) -- **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) -- **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) -- **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). 
-- **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) -- **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) -- **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) -- **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/) +* **EAGER v1**, CircularMapper, DeDup* Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. [https://doi.org/10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z). Download: [https://github.com/apeltzer/EAGER-GUI](https://github.com/apeltzer/EAGER-GUI) and [https://github.com/apeltzer/EAGER-CLI](https://github.com/apeltzer/EAGER-CLI) +* **FastQC** Download: [https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +* **AdapterRemoval v2** Schubert, M., Lindgreen, S., & Orlando, L. (2016). AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. [https://doi.org/10.1186/s13104-016-1900-2](https://doi.org/10.1186/s13104-016-1900-2). Download: [https://github.com/MikkelSchubert/adapterremoval](https://github.com/MikkelSchubert/adapterremoval) +* **bwa** Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. [https://doi.org/10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324). 
Download: [http://bio-bwa.sourceforge.net/bwa.shtml](http://bio-bwa.sourceforge.net/bwa.shtml) +* **SAMtools** Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352). Download: [http://www.htslib.org/](http://www.htslib.org/) +* **DamageProfiler** Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). [https://doi.org/10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: [https://github.com/Integrative-Transcriptomics/DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler) +* **QualiMap** Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. [https://doi.org/10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566). Download: [http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) +* **preseq** Daley, T., & Smith, A. D. (2013). Predicting the molecular complexity of sequencing libraries. Nature Methods, 10(4), 325–327. [https://doi.org/10.1038/nmeth.2375](https://doi.org/10.1038/nmeth.2375). Download: [http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) +* **PMDTools** Skoglund, P., Northoff, B. H., Shunkov, M. V., Derevianko, A. P., Pääbo, S., Krause, J., & Jakobsson, M. (2014). Separating endogenous ancient DNA from modern day contamination in a Siberian Neandertal. Proceedings of the National Academy of Sciences of the United States of America, 111(6), 2229–2234. [https://doi.org/10.1073/pnas.1318934111](https://doi.org/10.1073/pnas.1318934111). 
Download: [https://github.com/pontussk/PMDtools](https://github.com/pontussk/PMDtools) +* **MultiQC** Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. [https://doi.org/10.1093/bioinformatics/btw354](https://doi.org/10.1093/bioinformatics/btw354). Download: [https://multiqc.info/](https://multiqc.info/) +* **BamUtils** Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. [https://doi.org/10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114). Download: [https://genome.sph.umich.edu/wiki/BamUtil](https://genome.sph.umich.edu/wiki/BamUtil) +* **FastP** Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. [https://doi.org/10.1093/bioinformatics/bty560](https://doi.org/10.1093/bioinformatics/bty560). Download: [https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) +* **GATK 3.5** DePristo, M. A., Banks, E., Poplin, R., Garimella, K. V., Maguire, J. R., Hartl, C., … Daly, M. J. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nature Genetics, 43(5), 491–498. [https://doi.org/10.1038/ng.806](https://doi.org/10.1038/ng.806.).Download: [https://console.cloud.google.com/storage/browser/gatk](https://console.cloud.google.com/storage/browser/gatk) +* **GATK 4.X** - no citation available yet. Download: [https://github.com/broadinstitute/gatk/releases](https://github.com/broadinstitute/gatk/releases) +* **VCF2Genome** - Alexander Herbig and Alex Peltzer (unpublished). Download: [https://github.com/apeltzer/VCF2Genome](https://github.com/apeltzer/VCF2Genome) +* **MultiVCFAnalyzer** Bos, K.I. et al., 2014. 
Pre-Columbian mycobacterial genomes reveal seals as a source of New World human tuberculosis. Nature, 514(7523), pp.494–497. Available at: [http://dx.doi.org/10.1038/nature13591](http://dx.doi.org/10.1038/nature13591). Download: [https://github.com/alexherbig/MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) +* **MTNucRatioCalculator** Alex Peltzter (Unpublished). Download: [https://github.com/apeltzer/MTNucRatioCalculator](https://github.com/apeltzer/MTNucRatioCalculator) +* **Sex.DetERRmine.py** Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: [http://dx.doi.org/10.1038/s41467-018-07483-5](http://dx.doi.org/10.1038/s41467-018-07483-5). Download: [https://github.com/TCLamnidis/Sex.DetERRmine.git](https://github.com/TCLamnidis/Sex.DetERRmine.git) +* **ANGSD** Korneliussen, T.S., Albrechtsen, A. & Nielsen, R., 2014. ANGSD: Analysis of Next Generation Sequencing Data. BMC bioinformatics, 15, p.356. Available at: [http://dx.doi.org/10.1186/s12859-014-0356-4](http://dx.doi.org/10.1186/s12859-014-0356-4). Download: [https://github.com/ANGSD/angsd](https://github.com/ANGSD/angsd) +* **bedtools** Quinlan, A.R. & Hall, I.M., 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics , 26(6), pp.841–842. Available at: [http://dx.doi.org/10.1093/bioinformatics/btq033](http://dx.doi.org/10.1093/bioinformatics/btq033). Download: [https://github.com/arq5x/bedtools2/releases](https://github.com/arq5x/bedtools2/) +* **MALT**. Download: [https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html) + * Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. 
Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). + * Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). +* **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) +* **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) +* **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) +* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). +* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) +* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) +* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. 
[https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) +* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](sourceforge.net/projects/bbmap/) ## Data References This repository uses test data from the following studies: -- Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1). -- Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257). -- Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. [doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114). -- de Barros Damgaard, P. et al. (2018). '137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2) +* Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1). +* Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257). +* Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. 
[doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114). +* de Barros Damgaard, P. et al. (2018). '137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2) diff --git a/assets/angsd_resources/README b/assets/angsd_resources/README index 0f2b8c018..49cfd6c06 100644 --- a/assets/angsd_resources/README +++ b/assets/angsd_resources/README @@ -7,8 +7,9 @@ wget http://hapmap.ncbi.nlm.nih.gov/downloads/frequencies/2010-08_phaseII+III/al wget http://hapmap.ncbi.nlm.nih.gov/downloads/frequencies/2010-08_phaseII+III/allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz #with the md5sum -a105316eaa2ebbdb3f8d62a9cb10a2d5 allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz -5a0f920951ce2ded4afe2f10227110ac allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz +a105316eaa2ebbdb3f8d62a9cb10a2d5 allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt.gz +5a0f920951ce2ded4afe2f10227110ac allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz + ##create dummy bed file to use the liftover tools gunzip -c allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz| awk '{print $2" "$3-1" "$3" "$11" "$12" "$4" "$14}'|sed 1d >allele.txt @@ -17,7 +18,8 @@ gunzip -c allele_freqs_chrX_CEU_r28_nr.b36_fwd.txt.gz| awk '{print $2" "$3-1" "$ liftOver allele.txt /opt/liftover/hg18ToHg19.over.chain.gz hit nohit ##now remove invarible sites, and redundant columns -cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchrX.gz +cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchrX.gz + ##create dummy bed file to use the liftover tools gunzip -c allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt| awk '{print $2" "$3-1" "$3" "$11" "$12" "$4" "$14}'|sed 1d >allele.txt @@ -26,14 +28,15 @@ gunzip -c allele_freqs_chr21_CEU_r28_nr.b36_fwd.txt| awk '{print $2" "$3-1" "$3" liftOver allele.txt /opt/liftover/hg18ToHg19.over.chain.gz hit nohit ##now remove invarible sites, and redundant columns -cut -f1,3 
--complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchr21.gz +cut -f1,3 --complement hit |grep -v -P "\t1.0"|grep -v -P "\t0\t"|gzip -c >HapMapchr21.gz + ####### ##download 100kmer mappability wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign100mer.bigWig #md5sum -a1b1a8c99431fedf6a3b4baef028cca4 wgEncodeCrgMapabilityAlign100mer.bigWig +a1b1a8c99431fedf6a3b4baef028cca4 wgEncodeCrgMapabilityAlign100mer.bigWig ##download convert program wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph @@ -42,6 +45,6 @@ wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph ./bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig chrX -chrom=chrX ./bigWigToBedGraph wgEncodeCrgMapabilityAlign100mer.bigWig chr21 -chrom=chr21 -##only keep unique regions and discard the chr\* column +##only keep unique regions and discard the chr* column grep -P "\t1$" chr21 |cut -f2-3 |gzip -c >chr21.unique.gz grep -P "\t1$" chrX |cut -f2-3 |gzip -c >chrX.unique.gz diff --git a/assets/email_template.html b/assets/email_template.html index 1e4a996f1..36bfc9c8d 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -1,134 +1,53 @@ - - - - + + + + - - Codestin Search App - - -
- + + Codestin Search App + + +
-

nf-core/eager v${version}

-

Run Name: $runName

+ - <% if (!success){ out << """ -
-

- nf-core/eager execution completed unsuccessfully! -

-

- The exit status of the task that caused the workflow execution to fail - was: $exitStatus. -

+

nf-core/eager v${version}

+

Run Name: $runName

+ +<% if (!success){ + out << """ +
+

nf-core/eager execution completed unsuccessfully!

+

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

-
-${errorReport}
-
- """ } else { out << """ -
+
${errorReport}
+
+ """ +} else { + out << """ +
nf-core/eager execution completed successfully! -
- """ } %> +
+ """ +} +%> -

- The workflow was completed at $dateComplete (duration: - $duration) -

-

The command used to launch the workflow was as follows:

-
-$commandLine
+

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
-

Pipeline Configuration:

- - - <% out << summary.collect{ k,v -> " - - - - - " }.join("\n") %> - -
- $k - -
$v
-
+

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
-

nf-core/eager

-

- https://github.com/nf-core/eager -

-
- +

nf-core/eager

+

https://github.com/nf-core/eager

+ +
+ + diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 354485c2a..ba9050a05 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,316 +1,316 @@ -custom_logo: "nf-core_eager_logo_outline_drop.png" +custom_logo: 'nf-core_eager_logo_outline_drop.png' custom_logo_url: https://github.com/nf-core/eager/ -custom_logo_title: "nf-core/eager" +custom_logo_title: 'nf-core/eager' report_comment: > - This report has been generated by the nf-core/eager - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/eager + analysis pipeline. For information about how to interpret these results, please see the + documentation. run_modules: - - adapterRemoval - - bowtie2 - - custom_content - - damageprofiler - - dedup - - fastp - - fastqc - - gatk - - kraken - - malt - - mtnucratio - - multivcfanalyzer - - picard - - preseq - - qualimap - - samtools - - sexdeterrmine - - hops - - bcftools + - adapterRemoval + - bowtie2 + - custom_content + - damageprofiler + - dedup + - fastp + - fastqc + - gatk + - kraken + - malt + - mtnucratio + - multivcfanalyzer + - picard + - preseq + - qualimap + - samtools + - sexdeterrmine + - hops + - bcftools extra_fn_clean_exts: - - "_fastp" - - ".pe.settings" - - ".se.settings" - - ".settings" - - ".pe.combined" - - ".se.truncated" - - ".mapped" - - ".mapped_rmdup" - - ".mapped_rmdup_stats" - - "_libmerged_rg_rmdup" - - "_libmerged_rg_rmdup_stats" - - "_postfilterflagstat.stats" - - "_flagstat.stat" - - ".filtered" - - ".filtered_rmdup" - - ".filtered_rmdup_stats" - - "_libmerged_rg_add" - - "_libmerged_rg_add_stats" - - "_rmdup" - - ".unmapped" - - ".fastq.gz" - - ".fastq" - - ".fq.gz" - - ".fq" - - ".bam" - - ".kreport" - - ".unifiedgenotyper" - - ".trimmed_stats" - - "_libmerged" - - "_bt2" + - '_fastp' + - '.pe.settings' + - '.se.settings' + - '.settings' + - '.pe.combined' + - '.se.truncated' + - '.mapped' + 
- '.mapped_rmdup' + - '.mapped_rmdup_stats' + - '_libmerged_rg_rmdup' + - '_libmerged_rg_rmdup_stats' + - '_postfilterflagstat.stats' + - '_flagstat.stat' + - '.filtered' + - '.filtered_rmdup' + - '.filtered_rmdup_stats' + - '_libmerged_rg_add' + - '_libmerged_rg_add_stats' + - '_rmdup' + - '.unmapped' + - '.fastq.gz' + - '.fastq' + - '.fq.gz' + - '.fq' + - '.bam' + - '.kreport' + - '.unifiedgenotyper' + - '.trimmed_stats' + - '_libmerged' + - '_bt2' top_modules: - - "fastqc": - name: "FastQC (pre-Trimming)" - path_filters: - - "*_raw_fastqc.zip" - - "fastp" - - "adapterRemoval" - - "fastqc": - name: "FastQC (post-Trimming)" - path_filters: - - "*.truncated_fastqc.zip" - - "*.combined*_fastqc.zip" - - "bowtie2": - path_filters: - - "*_bt2.log" - - "malt" - - "hops" - - "kraken" - - "samtools": - name: "Samtools Flagstat (pre-samtools filter)" - path_filters: - - "*_flagstat.stats" - - "samtools": - name: "Samtools Flagstat (post-samtools filter)" - path_filters: - - "*_postfilterflagstat.stats" - - "dedup" - - "picard" - - "preseq": - path_filters: - - "*.preseq" - - "damageprofiler" - - "mtnucratio" - - "qualimap" - - "sexdeterrmine" - - "bcftools" - - "multivcfanalyzer": - path_filters: - - "*MultiVCFAnalyzer.json" + - 'fastqc': + name: 'FastQC (pre-Trimming)' + path_filters: + - '*_raw_fastqc.zip' + - 'fastp' + - 'adapterRemoval' + - 'fastqc': + name: 'FastQC (post-Trimming)' + path_filters: + - '*.truncated_fastqc.zip' + - '*.combined*_fastqc.zip' + - 'bowtie2': + path_filters: + - '*_bt2.log' + - 'malt' + - 'hops' + - 'kraken' + - 'samtools': + name: 'Samtools Flagstat (pre-samtools filter)' + path_filters: + - '*_flagstat.stats' + - 'samtools': + name: 'Samtools Flagstat (post-samtools filter)' + path_filters: + - '*_postfilterflagstat.stats' + - 'dedup' + - 'picard' + - 'preseq': + path_filters: + - '*.preseq' + - 'damageprofiler' + - 'mtnucratio' + - 'qualimap' + - 'sexdeterrmine' + - 'bcftools' + - 'multivcfanalyzer': + path_filters: + - 
'*MultiVCFAnalyzer.json' qualimap_config: - general_stats_coverage: - - 1 - - 2 - - 3 - - 4 - - 5 + general_stats_coverage: + - 1 + - 2 + - 3 + - 4 + - 5 remove_sections: - - sexdeterrmine-snps + - sexdeterrmine-snps table_columns_visible: - FastQC (pre-Trimming): - percent_duplicates: False - percent_gc: True - avg_sequence_length: True - fastp: - pct_duplication: False - after_filtering_gc_content: True - pct_surviving: False - Adapter Removal: - aligned_total: False - percent_aligned: True - FastQC (post-Trimming): - avg_sequence_length: True - percent_duplicates: False - total_sequences: True - percent_gc: True - bowtie2: - overall_alignment_rate: True - MALT: - Taxonomic assignment success: False - Assig. Taxonomy: False - Mappability: True - Total reads: False - Num. of queries: False - Kraken: - "% Unclassified": True - "% Top 5": False - Samtools Flagstat (pre-samtools filter): - flagstat_total: True - mapped_passed: True - Samtools Flagstat (post-samtools filter): - mapped_passed: True - DeDup: - dup_rate: False - clusterfactor: True - mapped_after_dedup: True - Picard: - PERCENT_DUPLICATION: True - DamageProfiler: - 5 Prime1: True - 5 Prime2: True - 3 Prime1: False - 3 Prime2: False - mean_readlength: True - median: True - mtnucratio: - mt_nuc_ratio: True - QualiMap: - mapped_reads: True - mean_coverage: True - 1_x_pc: True - 5_x_pc: True - percentage_aligned: False - median_insert_size: False - MultiVCFAnalyzer: - Heterozygous SNP alleles (percent): True - endorSpy: - endogenous_dna: True - endogenous_dna_post: True - nuclear_contamination: - Num_SNPs: True - Method1_MOM_estimate: False - Method1_MOM_SE: False - Method1_ML_estimate: True - Method1_ML_SE: True - Method2_MOM_estimate: False - Method2_MOM_SE: False - Method2_ML_estimate: False - Method2_ML_SE: False - snp_coverage: - Covered_Snps: True - Total_Snps: False + FastQC (pre-Trimming): + percent_duplicates: False + percent_gc: True + avg_sequence_length: True + fastp: + pct_duplication: False + 
after_filtering_gc_content: True + pct_surviving: False + Adapter Removal: + aligned_total: False + percent_aligned: True + FastQC (post-Trimming): + avg_sequence_length: True + percent_duplicates: False + total_sequences: True + percent_gc: True + bowtie2: + overall_alignment_rate: True + MALT: + Taxonomic assignment success: False + Assig. Taxonomy: False + Mappability: True + Total reads: False + Num. of queries: False + Kraken: + '% Unclassified': True + '% Top 5': False + Samtools Flagstat (pre-samtools filter): + flagstat_total: True + mapped_passed: True + Samtools Flagstat (post-samtools filter): + mapped_passed: True + DeDup: + dup_rate: False + clusterfactor: True + mapped_after_dedup: True + Picard: + PERCENT_DUPLICATION: True + DamageProfiler: + 5 Prime1: True + 5 Prime2: True + 3 Prime1: False + 3 Prime2: False + mean_readlength: True + median: True + mtnucratio: + mt_nuc_ratio: True + QualiMap: + mapped_reads: True + mean_coverage: True + 1_x_pc: True + 5_x_pc: True + percentage_aligned: False + median_insert_size: False + MultiVCFAnalyzer: + Heterozygous SNP alleles (percent): True + endorSpy: + endogenous_dna: True + endogenous_dna_post: True + nuclear_contamination: + Num_SNPs: True + Method1_MOM_estimate: False + Method1_MOM_SE: False + Method1_ML_estimate: True + Method1_ML_SE: True + Method2_MOM_estimate: False + Method2_MOM_SE: False + Method2_ML_estimate: False + Method2_ML_SE: False + snp_coverage: + Covered_Snps: True + Total_Snps: False table_columns_placement: - FastQC (pre-Trimming): - total_sequences: 100 - avg_sequence_length: 110 - percent_gc: 120 - fastp: - after_filtering_gc_content: 200 - Adapter Removal: - percent_aligned: 300 - FastQC (post-Trimming): - total_sequences: 400 - avg_sequence_length: 410 - percent_gc: 420 - Bowtie 2 / HiSAT2: - overall_alignment_rate: 450 - MALT: - Num. of queries: 430 - Total reads: 440 - Mappability: 450 - Assig. 
Taxonomy: 460 - Taxonomic assignment success: 470 - Kraken: - "% Unclassified": 480 - Samtools Flagstat (pre-samtools filter): - flagstat_total: 551 - mapped_passed: 552 - Samtools Flagstat (post-samtools filter): - flagstat_total: 600 - mapped_passed: 620 - endorSpy: - endogenous_dna: 610 - endogenous_dna_post: 640 - nuclear_contamination: - Num_SNPs: 1100 - Method1_MOM_estimate: 1110 - Method1_MOM_SE: 1120 - Method1_ML_estimate: 1130 - Method1_ML_SE: 1140 - Method2_MOM_estimate: 1150 - Method2_MOM_SE: 1160 - Method2_ML_estimate: 1170 - Method2_ML_SE: 1180 - snp_coverage: - Covered_Snps: 1050 - Total_Snps: 1060 - DeDup: - mapped_after_dedup: 620 - clusterfactor: 630 - Picard: - PERCENT_DUPLICATION: 650 - DamageProfiler: - 5 Prime1: 700 - 5 Prime2: 710 - 3 Prime1: 720 - 3 Prime2: 730 - mean_readlength: 740 - median: 750 - mtnucratio: - mtreads: 760 - mt_cov_avg: 770 - mt_nuc_ratio: 780 - QualiMap: - mapped_reads: 800 - mean_coverage: 805 - median_coverage: 810 - 1_x_pc: 820 - 2_x_pc: 830 - 3_x_pc: 840 - 4_x_pc: 850 - 5_x_pc: 860 - avg_gc: 870 - sexdeterrmine: - RateX: 1000 - RateY: 1010 - MultiVCFAnalyzer: - Heterozygous SNP alleles (percent): 1200 + FastQC (pre-Trimming): + total_sequences: 100 + avg_sequence_length: 110 + percent_gc: 120 + fastp: + after_filtering_gc_content: 200 + Adapter Removal: + percent_aligned: 300 + FastQC (post-Trimming): + total_sequences: 400 + avg_sequence_length: 410 + percent_gc: 420 + Bowtie 2 / HiSAT2: + overall_alignment_rate: 450 + MALT: + Num. of queries: 430 + Total reads: 440 + Mappability: 450 + Assig. 
Taxonomy: 460 + Taxonomic assignment success: 470 + Kraken: + '% Unclassified': 480 + Samtools Flagstat (pre-samtools filter): + flagstat_total: 551 + mapped_passed: 552 + Samtools Flagstat (post-samtools filter): + flagstat_total: 600 + mapped_passed: 620 + endorSpy: + endogenous_dna: 610 + endogenous_dna_post: 640 + nuclear_contamination: + Num_SNPs: 1100 + Method1_MOM_estimate: 1110 + Method1_MOM_SE: 1120 + Method1_ML_estimate: 1130 + Method1_ML_SE: 1140 + Method2_MOM_estimate: 1150 + Method2_MOM_SE: 1160 + Method2_ML_estimate: 1170 + Method2_ML_SE: 1180 + snp_coverage: + Covered_Snps: 1050 + Total_Snps: 1060 + DeDup: + mapped_after_dedup: 620 + clusterfactor: 630 + Picard: + PERCENT_DUPLICATION: 650 + DamageProfiler: + 5 Prime1: 700 + 5 Prime2: 710 + 3 Prime1: 720 + 3 Prime2: 730 + mean_readlength: 740 + median: 750 + mtnucratio: + mtreads: 760 + mt_cov_avg: 770 + mt_nuc_ratio: 780 + QualiMap: + mapped_reads: 800 + mean_coverage: 805 + median_coverage: 810 + 1_x_pc: 820 + 2_x_pc: 830 + 3_x_pc: 840 + 4_x_pc: 850 + 5_x_pc: 860 + avg_gc: 870 + sexdeterrmine: + RateX: 1000 + RateY: 1010 + MultiVCFAnalyzer: + Heterozygous SNP alleles (percent): 1200 read_count_multiplier: 1 -read_count_prefix: "" -read_count_desc: "" -ancient_read_count_prefix: "" -ancient_read_count_desc: "" +read_count_prefix: '' +read_count_desc: '' +ancient_read_count_prefix: '' +ancient_read_count_desc: '' ancient_read_count_multiplier: 1 -decimalPoint_format: "." -thousandsSep_format: "," +decimalPoint_format: '.' +thousandsSep_format: ',' report_section_order: - software_versions: - order: -1000 - nf-core-eager-summary: - order: -1001 + software_versions: + order: -1000 + nf-core-eager-summary: + order: -1001 export_plots: true table_columns_name: - FastQC (pre-Trimming): - total_sequences: "Nr. 
Input Reads" - avg_sequence_length: "Length Input Reads" - percent_gc: "% GC Input Reads" - percent_duplicates: "% Dups Input Reads" - percent_fails: "% Failed Input Reads" - FastQC (post-Trimming): - total_sequences: "Nr. Processed Reads" - avg_sequence_length: "Length Processed Reads" - percent_gc: "% GC Processed Reads" - percent_duplicates: "% Dups Processed Reads" - percent_fails: "%Failed Processed Reads" - Samtools Flagstat (pre-samtools filter): - flagstat_total: "Nr. Reads Into Mapping" - mapped_passed: "Nr. Mapped Reads" - Samtools Flagstat (post-samtools filter): - flagstat_total: "Nr. Mapped Reads Post-Filter" - mapped_passed: "Nr. Mapped Reads Passed Post-Filter" - Endogenous DNA Post (%): - endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)" - Picard: - PERCENT_DUPLICATION: "% Dup. Mapped Reads" - DamageProfiler: - mean_readlength: "Mean Length Mapped Reads" - median_readlength: "Median Length Mapped Reads" - QualiMap: - mapped_reads: "Nr. Dedup. Mapped Reads" - total_reads: "Nr. Dedup. Total Reads" - avg_gc: "% GC Dedup. Mapped Reads" - Bcftools Stats: - number_of_records: "Nr. Overall Variants" - number_of_SNPs: "Nr. SNPs" - number_of_indels: "Nr. InDels" - MALT: - Mappability: "% Metagenomic Mappability" - SexDetErrmine: - RateErrX: "SexDet Err X Chr" - RateErrY: "SexDet Err Y Chr" - RateX: "SexDet Rate X Chr" - RateY: "SexDet Rate Y Chr" + FastQC (pre-Trimming): + total_sequences: "Nr. Input Reads" + avg_sequence_length: "Length Input Reads" + percent_gc: "% GC Input Reads" + percent_duplicates: "% Dups Input Reads" + percent_fails: "% Failed Input Reads" + FastQC (post-Trimming): + total_sequences: "Nr. Processed Reads" + avg_sequence_length: "Length Processed Reads" + percent_gc: "% GC Processed Reads" + percent_duplicates: "% Dups Processed Reads" + percent_fails: "%Failed Processed Reads" + Samtools Flagstat (pre-samtools filter): + flagstat_total: "Nr. Reads Into Mapping" + mapped_passed: "Nr. 
Mapped Reads" + Samtools Flagstat (post-samtools filter): + flagstat_total: "Nr. Mapped Reads Post-Filter" + mapped_passed: "Nr. Mapped Reads Passed Post-Filter" + Endogenous DNA Post (%): + endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)" + Picard: + PERCENT_DUPLICATION: "% Dup. Mapped Reads" + DamageProfiler: + mean_readlength: "Mean Length Mapped Reads" + median_readlength: "Median Length Mapped Reads" + QualiMap: + mapped_reads: "Nr. Dedup. Mapped Reads" + total_reads: "Nr. Dedup. Total Reads" + avg_gc: "% GC Dedup. Mapped Reads" + Bcftools Stats: + number_of_records: "Nr. Overall Variants" + number_of_SNPs: "Nr. SNPs" + number_of_indels: "Nr. InDels" + MALT: + Mappability: "% Metagenomic Mappability" + SexDetErrmine: + RateErrX: "SexDet Err X Chr" + RateErrY: "SexDet Err Y Chr" + RateX: "SexDet Rate X Chr" + RateY: "SexDet Rate Y Chr" \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index e7a82cd74..64eeacd97 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,21 +2,22 @@ The nf-core/eager documentation is split into the following pages: -- [Usage](usage.md) - - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - - Also includes: FAQ, Troubleshooting and Tutorials -- [Output](output.md) - - An overview of the different results produced by the pipeline and how to interpret them. +* [Usage](usage.md) + * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. + * Also includes: FAQ, Troubleshooting and Tutorials +* [Output](output.md) + * An overview of the different results produced by the pipeline and how to interpret them. You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re). 
Additional pages are: -- [Installation](https://nf-co.re/usage/installation) -- Pipeline configuration - - [Local installation](https://nf-co.re/usage/local_installation) - - [Adding your own system config](https://nf-co.re/usage/adding_own_config) - - [Reference genomes](https://nf-co.re/usage/reference_genomes) -- [Contribution Guidelines](../.github/CONTRIBUTING.md) - - Basic contribution & behaviour guidelines - - Checklists and guidelines for people who would like to contribute code +* [Installation](https://nf-co.re/usage/installation) +* Pipeline configuration + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) +* [Contribution Guidelines](../.github/CONTRIBUTING.md) + * Basic contribution & behaviour guidelines + * Checklists and guidelines for people who would like to contribute code + \ No newline at end of file diff --git a/docs/output.md b/docs/output.md index a9fbda3d9..8acdbe1d4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -19,25 +19,25 @@ results/ work/ ``` -- The parent directory `` is the parent directory of the run, either the directory the pipeline was run from or as specified by the `--outdir` flag. The default name of the output directory (unless otherwise specified) will be `./results/`. +* The parent directory `` is the parent directory of the run, either the directory the pipeline was run from or as specified by the `--outdir` flag. The default name of the output directory (unless otherwise specified) will be `./results/`. ### Primary Output Directories These directories are the ones you will use on a day-to-day basis and are those which you should familiarise yourself with. -- The `MultiQC` directory is the most important directory and contains the main summary report of the run in HTML format, which can be viewed in a web-browser of your choice. 
The sub-directory contains the MultiQC collected data used to build the HTML report. The Report allows you to get an overview of the sequencing and mapping quality as well as aDNA metrics (see the [MultiQC Report](#multiqc-report) section for more detail). -- A `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular part of the pipeline. +* The `MultiQC` directory is the most important directory and contains the main summary report of the run in HTML format, which can be viewed in a web-browser of your choice. The sub-directory contains the MultiQC collected data used to build the HTML report. The Report allows you to get an overview of the sequencing and mapping quality as well as aDNA metrics (see the [MultiQC Report](#multiqc-report) section for more detail). +* A `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular part of the pipeline. ### Secondary Output Directories These are less important directories which are used less often, normally in the context of bug-reporting. -- `pipeline_info/`: [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. 
This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. - - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. - - Documentation for interpretation of results in HTML format: `results_description.html`. -- `reference_genome/` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxiliary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not important. -- The `work/` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. +* `pipeline_info/`: [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. + * Documentation for interpretation of results in HTML format: `results_description.html`. 
+* `reference_genome/` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxiliary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not important. +* The `work/` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. > :warning: Note that `work/` will be created wherever you are running the `nextflow run` command from, unless you specify the location with `-w`, i.e. it will not by default be in `outdir`!. @@ -61,41 +61,41 @@ Each column name is supplied by the module, so you may see similar column names. The possible columns displayed by default are as follows (note you may see additional columns depending on what other modules you activate): -- **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. -- **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. -- **Length Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. -- **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. 
-- **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC %GC column. -- **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. -- **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. -- **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. -- **Length Processed Reads** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules -- **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below). -- **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. -- **% Unclassified** This is from Kraken. It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias. -- **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping. -- **Nr. Mapped Reads** This is from Samtools. 
This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. -- **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. -- **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) -- **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. -- **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. -- **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. 
It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). -- **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. -- **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -- **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -- **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. -- **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position. -- **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. -- **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. -- **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. 
-- **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. -- **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. -- **#SNPs Covered** This is from eigenstrat_snp_coverage. The number of called SNPs after genotyping with pileupcaller. -- **#SNPs Total** This is from eigenstrat_snp_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. -- **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. -- **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button. -- **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button. -- **% Hets** This is from MultiVCFAnalyzer. This reports the number of SNPs on an assumed haploid organism that have two possible alleles. A high percentage may indicate cross-mapping from a related species. +* **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. +* **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. +* **Length Input Reads** This is from Pre-AdapterRemoval FastQC. 
This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. +* **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. +* **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC %GC column. +* **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. +* **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. +* **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. +* **Length Processed Reads** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules +* **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below). +* **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias. +* **% Unclassified** This is from Kraken. 
It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias. +* **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping. +* **Nr. Mapped Reads** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering. +* **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. +* **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) +* **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. +* **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1 / CF)`. 
A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. +* **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective). +* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base. +* **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +* **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +* **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications. +* **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position. +* **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. +* **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. 
This should normally be close to the GC content of your reference genome. +* **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type. +* **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome. +* **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome. +* **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller. +* **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`. +* **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region. +* **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button. +* **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button. +* **% Hets** This is from MultiVCFAnalyzer. This reports the number of SNPs on an assumed haploid organism that have two possible alleles. A high percentage may indicate cross-mapping from a related species. For other non-default columns (activated under 'Configure Columns'), hover over the column name for further descriptions. 
@@ -107,13 +107,13 @@ For other non-default columns (activated under 'Configure Columns'), hover over You will receive output for each supplied FASTQ file. -When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will _normally_ be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). +When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will *normally* be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. 
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. +> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. > :warning: If you turned on `--post_ar_fastq_trimming` your 'post-Trimming' report the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming. #### Sequence Counts @@ -138,10 +138,10 @@ You will often see that the first 5 or so bases have slightly lower quality than Things to watch out for: -- all positions having Phred scores less than 27 -- a sharp drop-off of quality early in the read -- for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. - +* all positions having Phred scores less than 27 +* a sharp drop-off of quality early in the read +* for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. + #### Per Sequence Quality Scores This is a further summary of the previous plot. This is a histogram of the _overall_ read quality (compared to per-base, above). The x axis is the mean read-quality score (summarising all the bases of the read in a single value), and the y-axis is the number of reads with this Phred score. 
You should see a peak with the majority of your reads between 27-35. @@ -152,9 +152,9 @@ This is a further summary of the previous plot. This is a histogram of the _over Things to watch out for: -- bi-modal peaks which suggests artefacts in some of the sequencing cycles -- all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). - +* bi-modal peaks which suggests artefacts in some of the sequencing cycles +* all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). + #### Per Base Sequencing Content This is a heatmap which shows the average percentage of C, G, T, and A nucleotides across ~4bp bins across all reads. @@ -167,7 +167,7 @@ You expect to see whole heatmap to be a relatively equal block of colour (normal Things to watch out for: -- If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). +* If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). > If you see Poly-G tails, we recommend to turn on FastP poly-G trimming with EAGER. See the 'running' documentation page for details. 
@@ -181,7 +181,7 @@ This line graph shows the number percentage reads (y-axis) with an average perce Things to watch out for: -- If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running nf-core/eager using the poly-G trimming option from `fastp` See the 'running' documentation page for details. +* If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running nf-core/eager using the poly-G trimming option from `fastp` See the 'running' documentation page for details. #### Per Base N Content @@ -251,13 +251,13 @@ The pipeline will generate the respective output for each supplied FASTQ file. This line plot shows the average GC content (Y axis) across each nucleotide of the reads (X-axis). There are two buttons per read (i.e. 2 for single-end, and 4 for paired-end) representing before and after the poly-G tail trimming. -Before filtering, if you have poly-G tails, you should see the lines going up at the end of the right-hand side of the plot. +Before filtering, if you have poly-G tails, you should see the lines going up at the end of the right-hand side of the plot. After filtering, you should see that the average GC content along the reads is now reduced to around the general trend of the entire read. Things to look out for: -- If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. 
If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. +* If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. ### AdapterRemoval @@ -265,10 +265,10 @@ Things to look out for: AdapterRemoval a tool that does the post-sequencing clean up of your sequencing reads. It performs the following functions -- 'Merges' (or 'collapses') forward and reverse reads of Paired End data -- Removes remaining library indexing adapters -- Trims low quality base tails from ends of reads -- Removes too-short reads +* 'Merges' (or 'collapses') forward and reverse reads of Paired End data +* Removes remaining library indexing adapters +* Trims low quality base tails from ends of reads +* Removes too-short reads In more detail merging is where the same read from the forward and reverse files of a single library (based on the flowcell coordinates), are compared to find a stretch of sequence that are the same. If this overlap reaches certain quality thresholds, the two reads are 'collapsed' into a single read, with the base quality scores are updated accordingly accounting for the increase quality call precision. @@ -284,14 +284,14 @@ You will receive output for each FASTQ file supplied for single end data, or for These stacked bars plots are unfortunately a little confusing, when displayed in MultiQC. However are relatively straight-forward once you understand each category. They can be displayed as counts of reads per AdapterRemoval read-category, or as percentages of the same values. 
Each forward(/reverse) file combination are displayed once. -The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar _includes_ the other categories displayed (see below) in the calculation. +The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar *includes* the other categories displayed (see below) in the calculation. Other Categories: -- If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. -- If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. -- If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did required the removal of low quality bases at the end of reads. -- **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. However the other member of the pair is still retained in the output file as it still reached other quality criteria. +* If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. +* If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. +* If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did required the removal of low quality bases at the end of reads. +* **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. 
However the other member of the pair is still retained in the output file as it still reached other quality criteria.

@@ -305,11 +305,11 @@ If you see high numbers of discarded or truncated reads, you should check your F The length distribution plots show the number of reads at each read-length. You can change the plot to display different categories. -- All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. -- **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing -- **Singleton** represent those reads that had a one member of a pair discarded -- **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. -- **Discarded** here represents the number of reads that did not each the read length filter. You will likely see a vertical drop at what your threshold was set to. +* All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. +* **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing +* **Singleton** represent those reads that had a one member of a pair discarded +* **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. +* **Discarded** here represents the number of reads that did not reach the read length filter. You will likely see a vertical drop at what your threshold was set to.

@@ -323,7 +323,7 @@ With paired-end ancient DNA sequencing runs You expect to see a slight increase This module provides information on mapping when running the Bowtie2 aligner. Bowtie2, like bwa, takes raw FASTQ reads and finds the most likely place on the reference genome it derived from. While this module is somewhat redundant with the [Samtools](#samtools) (which reports mapping statistics for bwa) and the endorSp.y endogenous DNA value in the general statistics table, it does provide some details that could be useful in certain contexts. -You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Single/Paired-end alignments @@ -343,7 +343,7 @@ The main additional useful information compared to [Samtools](#samtools) is that MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping. -You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. 
#### Metagenomic Mappability @@ -355,7 +355,7 @@ Due to low 'endogenous' content of aDNA, and the high biodiversity of modern or

-This can also be influenced by the type of database you supplied — many databases have an over-abundance of taxa of clinical or economic interest, so when you have a large amount of uncharacterised environmental taxa, this may also result in low mappability. + This can also be influenced by the type of database you supplied — many databases have an over-abundance of taxa of clinical or economic interest, so when you have a large amount of uncharacterised environmental taxa, this may also result in low mappability. #### Taxonomic assignment success @@ -378,7 +378,7 @@ Kraken is another metagenomic classifier, but takes a different approach to alig It is useful when you do not have large computing power or you want very rapid but rough approximation of the metagenomic profile of your sample. -You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. #### Top Taxa @@ -396,7 +396,7 @@ However for screening for specific metagenomic profiles, such as ancient microbi This module provides numbers in raw counts of the mapping of your DNA reads to your reference genome. -You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each *library*. 
This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Flagstat Plot @@ -416,7 +416,7 @@ The remaining rows will be 0 when running `bwa aln` as these characteristics of ### DeDup -You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Background @@ -426,15 +426,15 @@ DeDup is a duplicate removal tool which searches for PCR duplicates and removes This stacked bar plot shows as a whole the total number of reads in the BAM file going into DeDup. The different sections of a given bar represents the following: -- **Not Removed** — the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). -- **Reverse Removed** — the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). -- **Forward Removed** — the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). -- **Merged Removed** — the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). - +* **Not Removed** — the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). 
+* **Reverse Removed** — the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). +* **Forward Removed** — the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). +* **Merged Removed** — the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). + Exceptions to the above: -- If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. -- If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections. +* If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. +* If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections.

@@ -442,8 +442,8 @@ Exceptions to the above: Things to look out for: -- The smaller the number of the duplicates removed the better. If you have a small number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. -- If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. +* The smaller the number of the duplicates removed the better. If you have a small number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. +* If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. ### Picard @@ -463,8 +463,8 @@ The amount of unmapped reads will depend on whether you have filtered out unmapp Things to look out for: -- The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. -- If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, a badly preserved sample with a very low yield, or a lot of left-over adapters that were able to map to your genome. +* The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. +* If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, a badly preserved sample with a very low yield, or a lot of left-over adapters that were able to map to your genome. 
### Preseq @@ -476,7 +476,7 @@ There are two algorithms from the tools we use: `c_curve` and `lc_extrap`. The f Due to endogenous DNA being so low when doing initial screening, the maths behind `lc_extrap` often fails as there is not enough data. Therefore nf-core/eager sticks with `c_curve` which gives a similar approximation of the library complexity, but is more robust to smaller datasets. -You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Complexity Curve @@ -490,9 +490,9 @@ The dashed line represents a 'perfect' library containing only unique molecules Plateauing can be caused by a number of reasons: -- You have simply sequenced your library to exhaustion -- You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio -- You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters) +* You have simply sequenced your library to exhaustion +* You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio +* You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. 
adapters) ### DamageProfiler @@ -502,24 +502,24 @@ DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics f Therefore, three main characteristics of ancient DNA are: -- Short DNA fragments -- Elevated G and As (purines) just before strand breaks -- Increased C and Ts at ends of fragments - -You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +* Short DNA fragments +* Elevated G and As (purines) just before strand breaks +* Increased C and Ts at ends of fragments +You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. + #### Misincorporation Plots The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. When looking at the misincorporation plots, keep the following in mind: -- As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. -- If your library has been **partially-UDG treated**, only the first one or two bases will display the misincorporation frequency. -- If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. 
-- If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. -- We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A. -- The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable. +* As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. +* If your library has been **partially-UDG treated**, only the first one or two bases will display the misincorporation frequency. +* If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. +* If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. +* We generally expect that the older the sample, or the less-ideal preservational environment (hot/wet) the greater the frequency of C to T/G to A. +* The curve will not be smooth when you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable.

@@ -533,9 +533,9 @@ The MultiQC DamageProfiler module length distribution plots show the frequency o When looking at the length distribution plots, keep in mind the following: -- Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. -- You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA -- You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination. +* Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. +* You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA +* You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however these reads are normally from modern contamination. ### QualiMap @@ -547,7 +547,7 @@ Qualimap is a tool which provides statistics on the quality of the mapping of yo Note that many of the statistics from this module are displayed in the General Stats table (see above), as they represent single values that are not plottable. -You will receive output for each _sample_. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). +You will receive output for each *sample*. This means you will receive statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). :warning: If your library has no reads mapping to the reference, this will result in an empty BAM file. Qualimap will therefore not produce any output even if a BAM exists!
@@ -563,9 +563,9 @@ The greater the number of bases covered at as high as possible fold coverage, th Things to watch out for: -- You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. -- If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. - +* You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. +* If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. + #### Cumulative Genome Coverage This plot shows how much of the genome in percentage (X axis) is covered by a given fold depth coverage (Y axis). @@ -586,9 +586,9 @@ This plot shows the distribution of the frequency of reads at different GC conte Things to watch out for: -- This plot should normally show a normal distribution around the average GC content of your reference genome. -- Bimodal peaks may represent lab-based artefacts that should be further investigated. -- Skews of the peak to a higher GC content that the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The nf-core/eager trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. 
+* This plot should normally show a normal distribution around the average GC content of your reference genome. +* Bimodal peaks may represent lab-based artefacts that should be further investigated. +* Skews of the peak to a higher GC content than the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The nf-core/eager trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. ### Sex.DetERRmine @@ -658,7 +658,7 @@ This table shows the contents of the `snpStatistics.tsv` file produced by MultiV #### Call statistics barplot -You can get different variants of the call statistics bar plot, depending on how you configured the MultiVCFAnalyzer options. +You can get different variants of the call statistics bar plot, depending on how you configured the MultiVCFAnalyzer options. If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to two different values (left panel A in the figure below), this allows you to assess the number of multi-allelic positions that were called in your genome. Typically MultiVCFAnalyzer is used for analysing smallish haploid genomes (such as mitochondrial or bacterial genomes), therefore a position with multiple possible 'alleles' suggests some form of cross-mapping from other taxa or presence of multiple strains. If this is the case, you will need to be careful with downstream analysis of the consensus sequence (e.g. for phylogenetic tree analysis) as you may accidentally pick up SNPs from other taxa/strains — particularly when dealing with low coverage data. Therefore if you have a high level of 'het' values (see image), you should carefully check your alignments manually to see how clean your genomes are, or whether you can do some form of strain separation (e.g. by majority/minority calling).
@@ -670,40 +670,40 @@ If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to the s ## Output Files -This section gives a brief summary of where to look for what files for downstream analysis. This covers _all_ modules. +This section gives a brief summary of where to look for what files for downstream analysis. This covers *all* modules. Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. -- `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. - - When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_genome` will be found here, containing the masked reference. -- `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. -- `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. -- `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. -- `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). 
You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). -- `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. -- `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. -- `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. -- `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. 
-- `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. -- `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. -- `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. -- `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). -- `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. -- `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). 
If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. -- `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. -- `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. -- `nuclear_contamination/`: this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. -- `bedtools/`: this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. 
BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). -- `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. -- `metagenomic_classification/`: this contains the output for a given metagenomic classifier. - - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). _Kmer duplication is defined as: number of kmers / number of unique kmers_. 
You will find two kraken reports formats available: - - the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) - - the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. - - finally, the `*.kraken.out` file are the direct output of Kraken2 - - ⚠️ If your sample has no hits, no kraken output files will be created for that sample! -- `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) -- `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. - `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the sample sample, **and** same UDG-treatment type will be in these BAM files. -- `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). 
This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) -- `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied. +* `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. + * When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_genome` will be found here, containing the masked reference. +* `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. +* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. +* `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. +* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). 
You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). +* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. +* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. +* `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. +* `preseq/`: this contains a `.preseq` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. 
+* `qualimap/`: this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. +* `damageprofiler/`: this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. +* `pmdtools/`: this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. +* `trimmed_bam/`: this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). +* `damage_rescaling/`: this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. +* `genotyping/`: this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). 
If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. +* `multivcfanalyzer/`: this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. +* `sex_determination/`: this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. +* `nuclear_contamination/`: this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. +* `bedtools/`: this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. 
BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). +* `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. +* `metagenomic_classification/`: this contains the output for a given metagenomic classifier. + * Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. + * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. 
You will find two kraken reports formats available: + * the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) + * the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. + * finally, the `*.kraken.out` file are the direct output of Kraken2 + * ⚠️ If your sample has no hits, no kraken output files will be created for that sample! +* `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) +* `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. + `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the sample sample, **and** same UDG-treatment type will be in these BAM files. +* `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). 
This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) +* `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These include things such as the number of positions, number of transitions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied. diff --git a/docs/usage.md b/docs/usage.md index 683dacfba..454b10a93 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -36,7 +36,8 @@ results # Finished results (configurable, see below) # Other Nextflow hidden files, eg. history of pipeline runs and old logs. ``` -To see the the nf-core/eager pipeline help message run: `nextflow run nf-core/eager --help` +To see the nf-core/eager pipeline help message run: `nextflow run +nf-core/eager --help` If you want to configure your pipeline interactively using a graphical user interface, please visit [nf-co.re @@ -91,30 +92,30 @@ They are loaded in sequence, so later profiles can overwrite earlier profiles. If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.
-- `docker` - - A generic configuration profile to be used with [Docker](https://docker.com/) - - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -- `singularity` - - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) - - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -- `podman` - - A generic configuration profile to be used with [Podman](https://podman.io/) - - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -- `shifter` - - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -- `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) - - Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) -- `conda` - - Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. 
- - A generic configuration profile to be used with [Conda](https://conda.io/docs/) - - Pulls most software from [Bioconda](https://bioconda.github.io/) -- `test` - - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters - -> _Important_: If running nf-core/eager on a cluster - ask your system +* `docker` + * A generic configuration profile to be used with [Docker](https://docker.com/) + * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +* `singularity` + * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +* `podman` + * A generic configuration profile to be used with [Podman](https://podman.io/) + * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +* `shifter` + * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +* `charliecloud` + * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + * Pulls software from Docker Hub: [`nfcore/eager`](https://hub.docker.com/r/nfcore/eager/) +* `conda` + * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + * A generic configuration profile to be used with [Conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) +* `test` + * A profile with a complete configuration for automated testing + * Includes links to test data so needs no other parameters + +> *Important*: If running nf-core/eager on a cluster - ask your system > administrator what profile to use. 
**Institution Specific Profiles** These are profiles specific to certain **HPC @@ -123,17 +124,17 @@ clusters**, and are centrally maintained at regular users of nf-core/eager, if you don't see your own institution here check the [nf-core/configs](https://github.com/nf-core/configs) repository. -- `uzh` - - A profile for the University of Zurich Research Cloud - - Loads Singularity and defines appropriate resources for running the +* `uzh` + * A profile for the University of Zurich Research Cloud + * Loads Singularity and defines appropriate resources for running the pipeline. -- `binac` - - A profile for the BinAC cluster at the University of Tuebingen 0 Loads +* `binac` + * A profile for the BinAC cluster at the University of Tuebingen 0 Loads Singularity and defines appropriate resources for running the pipeline -- `shh` - - A profile for the S/CDAG cluster at the Department of Archaeogenetics of +* `shh` + * A profile for the S/CDAG cluster at the Department of Archaeogenetics of the Max Planck Institute for the Science of Human History - - Loads Singularity and defines appropriate resources for running the pipeline + * Loads Singularity and defines appropriate resources for running the pipeline **Pipeline Specific Institution Profiles** There are also pipeline-specific institution profiles. I.e., we can also offer a profile which sets special @@ -144,10 +145,10 @@ pipelines. 
This can be seen at We currently offer a nf-core/eager specific profile for -- `shh` - - A profiler for the S/CDAG cluster at the Department of Archaeogenetics of +* `shh` + * A profiler for the S/CDAG cluster at the Department of Archaeogenetics of the Max Planck Institute for the Science of Human History - - In addition to the nf-core wide profile, this also sets the MALT resources + * In addition to the nf-core wide profile, this also sets the MALT resources to match our commonly used databases Further institutions can be added at @@ -258,7 +259,7 @@ There are two possible ways of supplying input sequencing data to nf-core/eager. This method is where you specify with `--input`, the path locations of FASTQ (optionally gzipped) or BAM file(s). This option is mutually exclusive to the [TSV input method](#tsv-input-method), which is used for more complex input configurations such as lane and library merging. -When using the direct method of `--input` you can specify one or multiple samples in one or more directories files. File names **must be unique**, even if in different directories. +When using the direct method of `--input` you can specify one or multiple samples in one or more directories files. File names **must be unique**, even if in different directories. By default, the pipeline _assumes_ you have paired-end data. If you want to run single-end data you must specify [`--single_end`]('#single_end') @@ -284,7 +285,7 @@ If you have multiple files in different directories, you can use additional wild 4. When using the pipeline with **paired end data**, the path must use `{1,2}` notation to specify read pairs. 5. Files names must be unique, having files with the same name, but in different directories is _not_ sufficient - - This can happen when a library has been sequenced across two sequencers on the same lane. Either rename the file, try a symlink with a unique name, or merge the two FASTQ files prior input. 
+ * This can happen when a library has been sequenced across two sequencers on the same lane. Either rename the file, try a symlink with a unique name, or merge the two FASTQ files prior input. 6. Due to limitations of downstream tools (e.g. FastQC), sample IDs may be truncated after the first `.` in the name, Ensure file names are unique prior to this! 7. For input BAM files you should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. @@ -302,8 +303,8 @@ The use of the TSV `--input` method is recommended when performing more complex This TSV should look like the following: -| Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | -| ----------- | ---------- | ---- | ---------------- | ------- | -------- | ------------ | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --- | +| Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | +|-------------|------------|------|------------------|--------|----------|--------------|---------------|----|----|-----| | JK2782 | JK2782 | 1 | 4 | PE | Mammoth | double | full | 
[https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz) | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz) | NA | | JK2802 | JK2802 | 2 | 2 | SE | Mammoth | double | full | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R1_001.fastq.gz.tengrand.fq.gz) | [https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz](https://github.com/nf-core/test-datasets/raw/eager/testdata/Mammoth/fastq/JK2802_AGAATAACCTACCA_L008_R2_001.fastq.gz.tengrand.fq.gz) | NA | @@ -316,22 +317,22 @@ When using TSV_input, nf-core/eager will merge FASTQ files of libraries with the Column descriptions are as follows: -- **Sample_Name:** A text string containing the name of a given sample of which there can be multiple libraries. All libraries with the same sample name and same SeqType will be merged after deduplication. -- **Library_ID:** A text string containing a given library, which there can be multiple sequencing lanes (with the same SeqType). -- **Lane:** A number indicating which lane the library was sequenced on. Files from the libraries sequenced on different lanes (and different SeqType) will be concatenated after read clipping and merging. -- **Colour Chemistry** A number indicating whether the Illumina sequencer the library was sequenced on was a 2 (e.g. 
Next/NovaSeq) or 4 (Hi/MiSeq) colour chemistry machine. This informs whether poly-G trimming (if turned on) should be performed. -- **SeqType:** A text string of either 'PE' or 'SE', specifying paired end (with both an R1 [or forward] and R2 [or reverse]) and single end data (only R1 [forward], or BAM). This will affect lane merging if different per library. -- **Organism:** A text string of the organism name of the sample or 'NA'. This currently has no functionality and can be set to 'NA', but will affect lane/library merging if different per library -- **Strandedness:** A text string indicating whether the library type is'single' or 'double'. This will affect lane/library merging if different per library. -- **UDG_Treatment:** A text string indicating whether the library was generated with UDG treatment - either 'full', 'half' or 'none'. Will affect lane/library merging if different per library. -- **R1:** A text string of a file path pointing to a forward or R1 FASTQ file. This can be used with the R2 column. File names **must be unique**, even if they are in different directories. -- **R2:** A text string of a file path pointing to a reverse or R2 FASTQ file, or 'NA' when single end data. This can be used with the R1 column. File names **must be unique**, even if they are in different directories. -- **BAM:** A text string of a file path pointing to a BAM file, or 'NA'. Cannot be specified at the same time as R1 or R2, both of which should be set to 'NA' +* **Sample_Name:** A text string containing the name of a given sample of which there can be multiple libraries. All libraries with the same sample name and same SeqType will be merged after deduplication. +* **Library_ID:** A text string containing a given library, which there can be multiple sequencing lanes (with the same SeqType). +* **Lane:** A number indicating which lane the library was sequenced on. 
Files from the libraries sequenced on different lanes (and different SeqType) will be concatenated after read clipping and merging. +* **Colour Chemistry** A number indicating whether the Illumina sequencer the library was sequenced on was a 2 (e.g. Next/NovaSeq) or 4 (Hi/MiSeq) colour chemistry machine. This informs whether poly-G trimming (if turned on) should be performed. +* **SeqType:** A text string of either 'PE' or 'SE', specifying paired end (with both an R1 [or forward] and R2 [or reverse]) and single end data (only R1 [forward], or BAM). This will affect lane merging if different per library. +* **Organism:** A text string of the organism name of the sample or 'NA'. This currently has no functionality and can be set to 'NA', but will affect lane/library merging if different per library +* **Strandedness:** A text string indicating whether the library type is'single' or 'double'. This will affect lane/library merging if different per library. +* **UDG_Treatment:** A text string indicating whether the library was generated with UDG treatment - either 'full', 'half' or 'none'. Will affect lane/library merging if different per library. +* **R1:** A text string of a file path pointing to a forward or R1 FASTQ file. This can be used with the R2 column. File names **must be unique**, even if they are in different directories. +* **R2:** A text string of a file path pointing to a reverse or R2 FASTQ file, or 'NA' when single end data. This can be used with the R1 column. File names **must be unique**, even if they are in different directories. +* **BAM:** A text string of a file path pointing to a BAM file, or 'NA'. 
Cannot be specified at the same time as R1 or R2, both of which should be set to 'NA' For example, the following TSV table: | Sample_Name | Library_ID | Lane | Colour_Chemistry | SeqType | Organism | Strandedness | UDG_Treatment | R1 | R2 | BAM | -| ----------- | ---------- | ---- | ---------------- | ------- | -------- | ------------ | ------------- | -------------------------------------------------------------- | -------------------------------------------------------------- | --- | +|-------------|------------|------|------------------|---------|----------|--------------|---------------|----------------------------------------------------------------|----------------------------------------------------------------|-----| | JK2782 | JK2782 | 7 | 4 | PE | Mammoth | double | full | data/JK2782_TGGCCGATCAACGA_L007_R1_001.fastq.gz.tengrand.fq.gz | data/JK2782_TGGCCGATCAACGA_L007_R2_001.fastq.gz.tengrand.fq.gz | NA | | JK2782 | JK2782 | 8 | 4 | PE | Mammoth | double | full | data/JK2782_TGGCCGATCAACGA_L008_R1_001.fastq.gz.tengrand.fq.gz | data/JK2782_TGGCCGATCAACGA_L008_R2_001.fastq.gz.tengrand.fq.gz | NA | | JK2802 | JK2802 | 7 | 4 | PE | Mammoth | double | full | data/JK2802_AGAATAACCTACCA_L007_R1_001.fastq.gz.tengrand.fq.gz | data/JK2802_AGAATAACCTACCA_L007_R2_001.fastq.gz.tengrand.fq.gz | NA | @@ -339,35 +340,35 @@ For example, the following TSV table: will have the following effects: -- After AdapterRemoval, and prior to mapping, FASTQ files from lane 7 and lane 8 _with the same `SeqType`_ (and all other _metadata_ columns) will be concatenated together for each **Library**. -- After mapping, and prior BAM filtering, BAM files with different `SeqType` (but with all other metadata columns the same) will be merged together for each **Library**. -- After duplicate removal, BAM files with different `Library_ID`s but with the same `Sample_Name` and the same `UDG_Treatment` will be merged together. -- If BAM trimming is turned on, all post-trimming BAMs (i.e. 
non-UDG and half-UDG ) will be merged with UDG-treated (untreated) BAMs, if they have the same `Sample_Name`. +* After AdapterRemoval, and prior to mapping, FASTQ files from lane 7 and lane 8 _with the same `SeqType`_ (and all other _metadata_ columns) will be concatenated together for each **Library**. +* After mapping, and prior BAM filtering, BAM files with different `SeqType` (but with all other metadata columns the same) will be merged together for each **Library**. +* After duplicate removal, BAM files with different `Library_ID`s but with the same `Sample_Name` and the same `UDG_Treatment` will be merged together. +* If BAM trimming is turned on, all post-trimming BAMs (i.e. non-UDG and half-UDG ) will be merged with UDG-treated (untreated) BAMs, if they have the same `Sample_Name`. Note the following important points and limitations for setting up: -- The TSV must use actual tabs (not spaces) between cells. -- The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. -- _File_ names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). - - At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. - - Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. 
- - If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. -- Lane IDs must be unique for each sequencing of each library. - - If you have a library sequenced e.g. on Lane 8 of two HiSeq runs, you can give a fake lane ID (e.g. 20) for one of the FASTQs, and the libraries will still be processed correctly. - - This also applies to the SeqType column, i.e. with the example above, if one run is PE and one run is SE, you need to give fake lane IDs to one of the runs as well. -- All _BAM_ files must be specified as `SE` under `SeqType`. - - You should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. -- nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration -- Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together. -- nf-core/eager is able to correctly handle libraries that are sequenced multiple times on different sequencing configurations (i.e mixtures of single- and paired-end data). These will be merged after mapping and considered 'paired-end' during downstream processes. - - **Important** we do not recommend choosing to use DeDup (i.e. `--dedupper 'dedup'`) when mixing PE and SE data, as SE data will not necessarily have the correct end position of the read, and DeDup requires both ends of the molecule to remove a duplicate read. Therefore you may end up with inflated (false-positive) coverages due to suboptimal deduplication. 
- - When you wish to run PE/SE data together, the default `-dedupper markduplicates` is therefore preferred, as it only looks at the first position. While more conservative (i.e. it'll remove more reads even if not technically duplicates, because it assumes it can't see the true ends of molecules), it is more consistent. - - An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`. - - If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager - - If you _regularly_ want to run the situation above, please leave a feature request on github. -- DamageProfiler, NuclearContamination, MTtoNucRatio and PreSeq are performed on each unique library separately after deduplication (but prior same-treated library merging). -- nf-core/eager functionality such as `--run_trim_bam` will be applied to only non-UDG (UDG_Treatment: none) or half-UDG (UDG_Treatment: half) libraries. - Qualimap is run on each sample, after merging of libraries (i.e. your values will reflect the values of all libraries combined - after being damage trimmed etc.). -- Genotyping will be typically performed on each `sample` independently, as normally all libraries will have been merged together. However, if you have a mixture of single-stranded and double-stranded libraries, you will normally need to genotype separately. In this case you **must** give each the SS and DS libraries _distinct_ `Sample_IDs`; otherwise you will receive a `file collision` error in steps such as `sexdeterrmine`, and then you will need to merge these yourself. We will consider changing this behaviour in the future if there is enough interest. +* The TSV must use actual tabs (not spaces) between cells. 
+* The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. +* *File* names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). + * At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. + * Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. + * If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. +* Lane IDs must be unique for each sequencing of each library. + * If you have a library sequenced e.g. on Lane 8 of two HiSeq runs, you can give a fake lane ID (e.g. 20) for one of the FASTQs, and the libraries will still be processed correctly. + * This also applies to the SeqType column, i.e. with the example above, if one run is PE and one run is SE, you need to give fake lane IDs to one of the runs as well. +* All _BAM_ files must be specified as `SE` under `SeqType`. + * You should provide a small decoy reference genome with pre-made indices, e.g. the human mtDNA or phiX genome, for the mandatory parameter `--fasta` in order to avoid long computational time for generating the index files of the reference genome, even if you do not actually need a reference genome for any downstream analyses. 
+* nf-core/eager will only merge multiple _lanes_ of sequencing runs with the same single-end or paired-end configuration +* Accordingly nf-core/eager will not merge _lanes_ of FASTQs with BAM files (unless you use `--run_convertbam`), as only FASTQ files are lane-merged together. +* nf-core/eager is able to correctly handle libraries that are sequenced multiple times on different sequencing configurations (i.e mixtures of single- and paired-end data). These will be merged after mapping and considered 'paired-end' during downstream processes. + * **Important** we do not recommend choosing to use DeDup (i.e. `--dedupper 'dedup'`) when mixing PE and SE data, as SE data will not necessarily have the correct end position of the read, and DeDup requires both ends of the molecule to remove a duplicate read. Therefore you may end up with inflated (false-positive) coverages due to suboptimal deduplication. + * When you wish to run PE/SE data together, the default `-dedupper markduplicates` is therefore preferred, as it only looks at the first position. While more conservative (i.e. it'll remove more reads even if not technically duplicates, because it assumes it can't see the true ends of molecules), it is more consistent. + * An error will be thrown if you try to merge both PE and SE and also supply `--skip_merging`. + * If you truly want to mix SE data and PE data but using mate-pair info for PE mapping, please run FASTQ preprocessing mapping manually and supply BAM files for downstream processing by nf-core/eager + * If you _regularly_ want to run the situation above, please leave a feature request on github. +* DamageProfiler, NuclearContamination, MTtoNucRatio and PreSeq are performed on each unique library separately after deduplication (but prior same-treated library merging). +* nf-core/eager functionality such as `--run_trim_bam` will be applied to only non-UDG (UDG_Treatment: none) or half-UDG (UDG_Treatment: half) libraries. 
- Qualimap is run on each sample, after merging of libraries (i.e. your values will reflect the values of all libraries combined - after being damage trimmed etc.). +* Genotyping will be typically performed on each `sample` independently, as normally all libraries will have been merged together. However, if you have a mixture of single-stranded and double-stranded libraries, you will normally need to genotype separately. In this case you **must** give each the SS and DS libraries _distinct_ `Sample_IDs`; otherwise you will receive a `file collision` error in steps such as `sexdeterrmine`, and then you will need to merge these yourself. We will consider changing this behaviour in the future if there is enough interest. ## Clean up @@ -407,7 +408,7 @@ hard drive footprint of the run, so be sure to do this! When using TSV input, nf-core/eager will attempt to merge all `Lanes` of a `Library_ID`, or all files with the same `Library_ID` or `Sample_ID`. However, -if you have specified the same `Lane` or `Library_ID` for two sets of FASTQ +if you have specified the same `Lane` or `Library_ID` for two sets of FASTQ files you will likely receive an error such as ```bash @@ -429,7 +430,7 @@ In some cases it maybe no output log is produced by a particular tool for MultiQ Known cases include: -- Qualimap: there will be no MultiQC output if the BAM file is empty. An empty BAM file is produced when no reads map to the reference and causes Qualimap to crash - this is crash is ignored by nf-core/eager (to allow the rest of the pipeline to continue) and will therefore have no log file for that particular sample/library +* Qualimap: there will be no MultiQC output if the BAM file is empty. 
An empty BAM file is produced when no reads map to the reference and causes Qualimap to crash - this is crash is ignored by nf-core/eager (to allow the rest of the pipeline to continue) and will therefore have no log file for that particular sample/library ## Tutorials @@ -546,10 +547,10 @@ If you change into this with `cd` and run `ls -la` you should see a collection of normal files, symbolic links (symlinks) and hidden files (indicated with `.` at the beginning of the file name). -- Symbolic links: are typically input files from previous processes. -- Normal files: are typically successfully completed output files from some of +* Symbolic links: are typically input files from previous processes. +* Normal files: are typically successfully completed output files from some of some of the commands in the process -- Hidden files are Nextflow generated files and include the submission commands +* Hidden files are Nextflow generated files and include the submission commands as well as log files When you have an error run, you can firstly check the contents of the output @@ -564,7 +565,9 @@ screen if you were running the command/program yourself. Again, view these with e.g. `cat` and see if you can identify the error of the program itself. Finally, you can also try running the commands _yourself_. You can firstly try -to do this by loading your given nf-core/eager environment (e.g. `singularity shell /\/\/nf-core-eager-X-X-X.img` or `conda activate nf-core-eager-X.X.X`), then running `bash .command.sh`. +to do this by loading your given nf-core/eager environment (e.g. `singularity +shell /\/\/nf-core-eager-X-X-X.img` or `conda activate +nf-core-eager-X.X.X`), then running `bash .command.sh`. If this doesn't work, this suggests either there is something wrong with the nf-core/eager environment configuration, _or_ there is still a problem with the @@ -583,7 +586,7 @@ the #eager channel). 
#### Tutorial Profiles - Background -A useful feature of Nextflow is the ability to use configuration _profiles_ that +A useful feature of Nextflow is the ability to use configuration *profiles* that can specify many default parameters and other settings on how to run your pipeline. @@ -604,9 +607,9 @@ DNA to map and cause false positive SNP calls. Within nf-core, there are two main levels of configs -- Institutional-level profiles: these normally define things like paths to +* Institutional-level profiles: these normally define things like paths to common storage, resource maximums, scheduling system -- Pipeline-level profiles: these normally define parameters specifically for a +* Pipeline-level profiles: these normally define parameters specifically for a pipeline (such as mapping parameters, turning specific modules on or off) As well as allowing more efficiency and control at cluster or Institutional @@ -614,7 +617,7 @@ levels in terms of memory usage, pipeline-level profiles can also assist in facilitating reproducible science by giving a way for researchers to 'publish' their exact pipeline parameters in way other users can automatically re-run the pipeline with the pipeline parameters used in the original publication but on -their _own_ cluster. +their *own* cluster. To illustrate this, lets say we analysed our data on a HPC called 'blue' for which an institutional profile already exists, and for our analysis we defined a @@ -664,11 +667,11 @@ This would be translated as follows. 
If your parameters looked like the following -| Parameter | Resolved Parameters | institution | cluster | my_paper | -| ------------ | ------------------- | ----------- | -------- | -------- | -| --executor | singularity | singularity | \ | \ | -| --max_memory | 256GB | 756GB | 256GB | \ | -| --bwa_aln | 0.1 | \ | 0.01 | 0.1 | +| Parameter | Resolved Parameters | institution | cluster | my_paper | +| ----------------|------------------------|-------------|----------|----------| +| --executor | singularity | singularity | \ | \ | +| --max_memory | 256GB | 756GB | 256GB | \ | +| --bwa_aln | 0.1 | \ | 0.01 | 0.1 | (where '\' is a parameter not defined in a given profile.) @@ -686,7 +689,7 @@ defined in the `cluster` profile. > institutional-level profiles. Otherwise please skip to [Writing your own profile](#tutorial-profiles---writing-your-own-profile) In actuality, a nf-core/eager run already contains many configs and profiles, -and will normally use _multiple_ configs profiles in a single run. Multiple +and will normally use *multiple* configs profiles in a single run. Multiple configuration and profiles files can be used, and each new one selected will inherit all the previous one's parameters, and the parameters in the new one will then overwrite any that have been changed from the original. @@ -724,7 +727,7 @@ nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna < In the background, any parameters in the pipeline's `nextflow.config` (containing default parameters) will be overwritten by the -`old_dna_profile.config`. In addition, the `old_dna` _profile_ will overwrite +`old_dna_profile.config`. In addition, the `old_dna` *profile* will overwrite any parameters set in the config but outside the profile definition of `old_dna_profile.config`. 
@@ -744,13 +747,13 @@ the `hpc_blue` profile, but the `mapper` parameter has been changed from The order of loading of different configuration files can be seen here: | Loading Order | Configuration File | -| ------------: | :-------------------------------------------------------------------------------------------------------------- | -| 1 | `nextflow.config` in your current directory | -| 2 | (if using a script for `nextflow run`) a `nextflow.config` in the directory the script is located | -| 3 | `config` stored in your human directory under `~/.nextflow/` | -| 4 | `.config` if you specify in the `nextflow run` command with `-c` | -| 5 | general nf-core institutional configurations stored at [nf-core/configs](https://github.com/nf-core/configs) | -| 6 | pipeline-specific nf-core institutional configurations at [nf-core/configs](https://github.com/nf-core/configs) | +| -------------:|:----------------------------------------------------------------------------------------------------------------| +| 1 | `nextflow.config` in your current directory | +| 2 | (if using a script for `nextflow run`) a `nextflow.config` in the directory the script is located | +| 3 | `config` stored in your human directory under `~/.nextflow/` | +| 4 | `.config` if you specify in the `nextflow run` command with `-c` | +| 5 | general nf-core institutional configurations stored at [nf-core/configs](https://github.com/nf-core/configs) | +| 6 | pipeline-specific nf-core institutional configurations at [nf-core/configs](https://github.com/nf-core/configs) | This loading order of these `.config` files will not normally affect the settings you use for the pipeline run itself; `-profiles` are normally more @@ -761,7 +764,7 @@ if your run does not use the parameters you expect. 
> specifying a custom `.config` file by using `-C` (capital C) instead of `-c` > (which inherits previously specify parameters) -Another thing that is important to note is that if a specific _profile_ is +Another thing that is important to note is that if a specific *profile* is specified in `nextflow run`, this replaces any 'global' parameter that is specified within the config file (but outside a profile) itself - **regardless** of profile order (see above). @@ -779,7 +782,7 @@ params { // Specific nf-core/configs params config_profile_contact = 'James Fellows Yates (@jfy133)' config_profile_description = 'nf-core/eager SHH profile provided by nf-core/configs' - + // default BWA bwaalnn = 0.04 bwaalnl = 32 @@ -801,7 +804,8 @@ profiles { ``` If you run with `nextflow run -profile shh` to specify to use an -institutional-level nf-core config, the parameters will be read as `--bwaalnn 0.04` and `--bwaalnl 32` as these are the default 'fall back' params as +institutional-level nf-core config, the parameters will be read as `--bwaalnn +0.04` and `--bwaalnl 32` as these are the default 'fall back' params as indicated in the example above. If you specify as `nextflow run -profile shh,pathogen_loose`, as expected @@ -1463,59 +1467,59 @@ For example, I normally look for things like: General Stats Table: -- Do I see the expected number of raw sequencing reads (summed across each set +* Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -- Does the percentage of trimmed reads look normal for aDNA, and do lengths +* Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? -- Does ClusterFactor or 'Dups' look high (e.g. >2 or >10% respectively) +* Does ClusterFactor or 'Dups' look high (e.g. >2 or >10% respectively) suggesting over-amplified or badly preserved samples? 
-- Do the mapped reads show increased frequency of C>Ts on the 5' end of +* Do the mapped reads show increased frequency of C>Ts on the 5' end of molecules? -- Is the number of SNPs used for nuclear contamination really low for any +* Is the number of SNPs used for nuclear contamination really low for any individuals (e.g. < 100)? If so, then the estimates might not be very accurate. FastQC (pre-AdapterRemoval): -- Do I see any very early drop off of sequence quality scores suggesting a +* Do I see any very early drop off of sequence quality scores suggesting a problematic sequencing run? -- Do I see outlier GC content distributions? -- Do I see high sequence duplication levels? +* Do I see outlier GC content distributions? +* Do I see high sequence duplication levels? AdapterRemoval: -- Do I see high numbers of singletons or discarded read pairs? +* Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -- Do I see improved sequence quality scores along the length of reads? -- Do I see reduced adapter content levels? +* Do I see improved sequence quality scores along the length of reads? +* Do I see reduced adapter content levels? Samtools Flagstat (pre/post Filter): -- Do I see outliers, e.g. with unusually high levels of human DNA, (indicative +* Do I see outliers, e.g. with unusually high levels of human DNA, (indicative of contamination) that require downstream closer assessment? Are your samples exceptionally preserved? If not, a value higher than e.g. 50% might require your attention. DeDup/Picard MarkDuplicates: -- Do I see large numbers of duplicates being removed, possibly indicating +* Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? DamageProfiler: -- Do I see evidence of damage on human DNA? - - High numbers of mapped reads but no damage may indicate significant +* Do I see evidence of damage on human DNA? 
+ * High numbers of mapped reads but no damage may indicate significant modern contamination. - - Was the read trimming I specified enough to overcome damage effects? + * Was the read trimming I specified enough to overcome damage effects? SexDetERRmine: -- Do the relative coverages on the X and Y chromosome fall within the expected +* Do the relative coverages on the X and Y chromosome fall within the expected areas of the plot? -- Do all individuals have enough data for accurate sex determination? -- Do the proportions of autosomal/X/Y reads make sense? If there is an +* Do all individuals have enough data for accurate sex determination? +* Do the proportions of autosomal/X/Y reads make sense? If there is an overrepresentation of reads within one bin, is the data enriched for that bin? > Detailed documentation and descriptions for all MultiQC modules can be seen in @@ -1615,7 +1619,7 @@ Prior setting up an nf-core/eager run for metagenomic screening, we will need: We should also ensure we have the very latest version of the nf-core/eager pipeline so we have all latest bugfixes etc. In this case we will be using nf-core/eager version 2.2.0. You should always check on the -[nf-core](https://nf-co.re/eager) website whether a newer release has been made +[nf-core](https://nf-co.re/eager) website whether a newer release has been made (particularly point releases e.g. 2.2.1). ```bash @@ -1906,58 +1910,58 @@ For example, I normally look for things like: General Stats Table: -- Do I see the expected number of raw sequencing reads (summed across each set +* Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -- Does the percentage of trimmed reads look normal for aDNA, and do lengths +* Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? 
-- Does ClusterFactor or 'Dups' look high suggesting over-amplified or +* Does ClusterFactor or 'Dups' look high suggesting over-amplified or badly preserved samples (e.g. >2 or >10% respectively - however given this is on the human reads this is just a rule of thumb and may not reflect the quality of the metagenomic profile) ? -- Does the human DNA show increased frequency of C>Ts on the 5' end of +* Does the human DNA show increased frequency of C>Ts on the 5' end of molecules? FastQC (pre-AdapterRemoval): -- Do I see any very early drop off of sequence quality scores suggesting +* Do I see any very early drop off of sequence quality scores suggesting problematic sequencing run? -- Do I see outlier GC content distributions? -- Do I see high sequence duplication levels? +* Do I see outlier GC content distributions? +* Do I see high sequence duplication levels? AdapterRemoval: -- Do I see high numbers of singletons or discarded read pairs? +* Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -- Do I see improved sequence quality scores along the length of reads? -- Do I see reduced adapter content levels? +* Do I see improved sequence quality scores along the length of reads? +* Do I see reduced adapter content levels? MALT: -- Do I have a reasonable level of mappability? - - Somewhere between 10-30% can be pretty normal for aDNA, whereas e.g. <1% +* Do I have a reasonable level of mappability? + * Somewhere between 10-30% can be pretty normal for aDNA, whereas e.g. <1% requires careful manual assessment -- Do I have a reasonable taxonomic assignment success? - - You hope to have a large number of the mapped reads (from the mappability +* Do I have a reasonable taxonomic assignment success? + * You hope to have a large number of the mapped reads (from the mappability plot) that also have taxonomic assignment. Samtools Flagstat (pre/post Filter): -- Do I see outliers, e.g. 
with unusually high levels of human DNA, (indicative +* Do I see outliers, e.g. with unusually high levels of human DNA, (indicative of contamination) that require downstream closer assessment? DeDup/Picard MarkDuplicates: -- Do I see large numbers of duplicates being removed, possibly indicating +* Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? DamageProfiler: -- Do I see evidence of damage on human DNA? Note this is just a +* Do I see evidence of damage on human DNA? Note this is just a rule-of-thumb/corroboration of any signals you might find in the metagenomic screening and not essential. - - If you have high numbers of human DNA reads but no damage may indicate + * If you have high numbers of human DNA reads but no damage may indicate significant modern contamination. > Detailed documentation and descriptions for all MultiQC modules can be seen in @@ -2082,7 +2086,7 @@ Prior setting up the nf-core/eager run, we will need: We should also ensure we have the very latest version of the nf-core/eager pipeline so we have all latest bugfixes etc. In this case we will be using nf-core/eager version 2.2.0. You should always check on the -[nf-core](https://nf-co.re/eager) website whether a newer release has been made +[nf-core](https://nf-co.re/eager) website whether a newer release has been made (particularly point releases e.g. 2.2.1). ```bash @@ -2528,80 +2532,80 @@ results. For example, I normally look for things like: General Stats Table: -- Do I see the expected number of raw sequencing reads (summed across each set +* Do I see the expected number of raw sequencing reads (summed across each set of FASTQ files per library) that was requested for sequencing? -- Does the percentage of trimmed reads look normal for aDNA, and do lengths +* Does the percentage of trimmed reads look normal for aDNA, and do lengths after trimming look short as expected of aDNA? 
-- Does the Endogenous DNA (%) columns look reasonable (high enough to indicate +* Does the Endogenous DNA (%) columns look reasonable (high enough to indicate you have received enough coverage for downstream, and/or do you lose an unusually high reads after filtering ) -- Does ClusterFactor or '% Dups' look high (e.g. >2 or >10% respectively - high +* Does ClusterFactor or '% Dups' look high (e.g. >2 or >10% respectively - high values suggesting over-amplified or badly preserved samples i.e. low complexity; note that genome-enrichment libraries may by their nature look higher). -- Do you see an increased frequency of C>Ts on the 5' end of molecules in the +* Do you see an increased frequency of C>Ts on the 5' end of molecules in the mapped reads? -- Do median read lengths look relatively low (normally <= 100 bp) indicating +* Do median read lengths look relatively low (normally <= 100 bp) indicating typically fragmented aDNA? -- Does the % coverage decrease relatively gradually at each depth coverage, and +* Does the % coverage decrease relatively gradually at each depth coverage, and does not drop extremely drastically -- Does the Median coverage and percent >3x (or whatever you set) show sufficient +* Does the Median coverage and percent >3x (or whatever you set) show sufficient coverage for reliable SNP calls and that a good proportion of the genome is covered indicating you have the right reference genome? -- Do you see a high proportion of % Hets, indicating many multi-allelic sites +* Do you see a high proportion of % Hets, indicating many multi-allelic sites (and possibly presence of cross-mapping from other species, that may lead to false positive or less confident SNP calls)? FastQC (pre-AdapterRemoval): -- Do I see any very early drop off of sequence quality scores suggesting +* Do I see any very early drop off of sequence quality scores suggesting problematic sequencing run? -- Do I see outlier GC content distributions? 
-- Do I see high sequence duplication levels? +* Do I see outlier GC content distributions? +* Do I see high sequence duplication levels? AdapterRemoval: -- Do I see high numbers of singletons or discarded read pairs? +* Do I see high numbers of singletons or discarded read pairs? FastQC (post-AdapterRemoval): -- Do I see improved sequence quality scores along the length of reads? -- Do I see reduced adapter content levels? +* Do I see improved sequence quality scores along the length of reads? +* Do I see reduced adapter content levels? Samtools Flagstat (pre/post Filter): -- Do I see outliers, e.g. with unusually low levels of mapped reads, (indicative +* Do I see outliers, e.g. with unusually low levels of mapped reads, (indicative of badly preserved samples) that require downstream closer assessment? DeDup/Picard MarkDuplicates: -- Do I see large numbers of duplicates being removed, possibly indicating +* Do I see large numbers of duplicates being removed, possibly indicating over-amplified or badly preserved samples? PreSeq: -- Do I see a large drop off of a sample's curve away from the theoretical +* Do I see a large drop off of a sample's curve away from the theoretical complexity? If so, this may indicate it's not worth performing deeper sequencing as you will get few unique reads (vs. duplicates that are not any more informative than the reads you've already sequenced) DamageProfiler: -- Do I see evidence of damage on the microbial DNA (i.e. a % C>T of more than ~5% in +* Do I see evidence of damage on the microbial DNA (i.e. a % C>T of more than ~5% in the first few nucleotide positions?) ? If not, possibly your mapped reads are deriving from modern contamination. QualiMap: -- Do you see a peak of coverage (X) at a good level, e.g. >= 3x, indicating +* Do you see a peak of coverage (X) at a good level, e.g. >= 3x, indicating sufficient coverage for reliable SNP calls? 
MultiVCFAnalyzer: -- Do I have a good number of called SNPs that suggest the samples have genomes +* Do I have a good number of called SNPs that suggest the samples have genomes with sufficient nucleotide diversity to inform phylogenetic analysis? -- Do you have a large number of discarded SNP calls? -- Are the % Hets very high indicating possible cross-mapping from off-target +* Do you have a large number of discarded SNP calls? +* Are the % Hets very high indicating possible cross-mapping from off-target organisms that may confounding variant calling? > Detailed documentation and descriptions for all MultiQC modules can be seen in diff --git a/nextflow_schema.json b/nextflow_schema.json index be3132b2e..ca9f4fe6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,1690 +1,1784 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", - "title": "nf-core/eager pipeline parameters", - "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data, and additional metadata.", - "required": ["input"], - "properties": { - "input": { - "type": "string", - "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", - "fa_icon": "fas fa-dna", - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. 
The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." - }, - "udg_type": { - "type": "string", - "default": "none", - "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input.", - "fa_icon": "fas fa-vial", - "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). 
Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", - "enum": ["none", "half", "full"] - }, - "single_stranded": { - "type": "boolean", - "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. Not required for TSV input.", - "fa_icon": "fas fa-minus", - "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). Default: false\n\nOnly required when using the 'Path' method of `--input`" - }, - "single_end": { - "type": "boolean", - "description": "Specifies that the input is single end reads. Not required for TSV input.", - "fa_icon": "fas fa-align-left", - "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" - }, - "colour_chemistry": { - "type": "integer", - "default": 4, - "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. Options: 2, 4.", - "fa_icon": "fas fa-palette", - "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. 
This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." - }, - "bam": { - "type": "boolean", - "description": "Specifies that the input is in BAM format. Not required for TSV input.", - "fa_icon": "fas fa-align-justify", - "help_text": "Specifies the input file type to `--input` is in BAM format. This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" - } - }, - "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." - }, - "input_data_additional_options": { - "title": "Input Data Additional Options", - "type": "object", - "description": "Additional options regarding input data.", - "default": "", - "properties": { - "snpcapture_bed": { - "type": "string", - "fa_icon": "fas fa-magnet", - "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome.", - "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, on-target metrics are automatically generated for you by qualimap." 
- }, - "run_convertinputbam": { - "type": "boolean", - "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", - "fa_icon": "fas fa-undo-alt", - "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." - } - }, - "fa_icon": "far fa-plus-square" - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "properties": { - "fasta": { - "type": "string", - "fa_icon": "fas fa-font", - "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", - "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" - }, - "genome": { - "type": "string", - "description": "Name of iGenomes reference (required if not FASTA reference). Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", - "fa_icon": "fas fa-book", - "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. 
If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n**NB** Requires argument `--igenomes_ignore false` as iGenomes ignored by default in nf-core/eager\n\n```" - }, - "igenomes_base": { - "type": "string", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - }, - "bwa_index": { - "type": "string", - "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. 
Do not include the files themselves. Most likely the same directory of the file provided with --fasta). If not supplied will be made for you.", - "fa_icon": "fas fa-address-book", - "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta` nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." - }, - "bt2_index": { - "type": "string", - "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", - "fa_icon": "far fa-address-book", - "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. 
Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." - }, - "fasta_index": { - "type": "string", - "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", - "fa_icon": "far fa-bookmark", - "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" - }, - "seq_dict": { - "type": "string", - "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", - "fa_icon": "fas fa-spell-check", - "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" - }, - "large_ref": { - "type": "boolean", - "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", - "fa_icon": "fas fa-mountain", - "help_text": "This parameter is required to be set for large reference genomes. If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. 
Default: off" - }, - "save_reference": { - "type": "boolean", - "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", - "fa_icon": "far fa-save", - "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" - } - }, - "description": "Specify locations of references and optionally, additional pre-made indices", - "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
- }, - "output_options": { - "title": "Output options", - "type": "object", - "description": "Specify where to put output files and optional saving of intermediate files", - "default": "", - "properties": { - "outdir": { - "type": "string", - "description": "The output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open", - "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "hidden": true, - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"] - } - }, - "fa_icon": "fas fa-cloud-download-alt" - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "hidden": true, - "fa_icon": "fas fa-question-circle" - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "An email address to send a summary email to when the pipeline is completed.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when 
pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "hidden": true, - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true, - "help_text": "Set to receive plain-text e-mails instead of HTML formatted." - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true, - "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true, - "help_text": "Set to disable colourful command line output and live life in monochrome." 
- }, - "multiqc_config": { - "type": "string", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, - "show_hidden_params": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "hidden": true, - "description": "Parameter used for checking conda channels to be set correctly." - }, - "schema_ignore_params": { - "type": "string", - "fa_icon": "fas fa-not-equal", - "description": "String to specify ignored parameters for parameter validation", - "hidden": true, - "default": "genomes" - } - }, - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } - }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog", - "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", - "fa_icon": "fas fa-users-cog" - }, - "hostnames": { - "type": "string", - "description": "Institutional configs hostname.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "awsqueue": { - "type": "string", - "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", - "fa_icon": "fab fa-aws" - }, - "awsregion": { - "type": "string", - "default": "eu-west-1", - "description": "The AWS Region for your AWS Batch job to run on", - "fa_icon": "fab fa-aws" - }, - "awscli": { - "type": "string", - "description": "Path to the AWS CLI tool", - "fa_icon": "fab fa-aws" - } - } - }, - "skip_steps": { - "title": "Skip steps", - "type": "object", - "description": "Skip any of the mentioned steps.", - "default": "", - "properties": { - "skip_fastqc": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - 
"help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." - }, - "skip_adapterremoval": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." - }, - "skip_preseq": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the computation of library complexity estimation." - }, - "skip_deduplication": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" - }, - "skip_damage_calculation": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" - }, - "skip_qualimap": { - "type": "boolean", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" - } - }, - "fa_icon": "fas fa-fast-forward", - "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." - }, - "complexity_filtering": { - "title": "Complexity filtering", - "type": "object", - "description": "Processing of Illumina two-colour chemistry data.", - "default": "", - "properties": { - "complexity_filter_poly_g": { - "type": "boolean", - "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", - "fa_icon": "fas fa-power-off", - "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. 
This can be useful for trimming poly-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" - }, - "complexity_filter_poly_g_min": { - "type": "integer", - "default": 10, - "description": "Specify length of poly-g min for clipping to be performed.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" - } - }, - "fa_icon": "fas fa-filter", - "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" - }, - "read_merging_and_adapter_removal": { - "title": "Read merging and adapter removal", - "type": "object", - "description": "Options for adapter clipping and paired-end merging.", - "default": "", - "properties": { - "clip_forward_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", - "description": "Specify adapter sequence to be clipped off (forward strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the forward read. By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" - }, - "clip_reverse_adaptor": { - "type": "string", - "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", - "description": "Specify adapter sequence to be clipped off (reverse strand).", - "fa_icon": "fas fa-cut", - "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. 
This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" - }, - "clip_adapters_list": { - "type": "string", - "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", - "fa_icon": "fas fa-cut", - "help_text": "Allows to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters . First column represents forward strand, second column for reverse strand. You must supply all possibly combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" - }, - "clip_readlength": { - "type": "integer", - "default": 30, - "description": "Specify read minimum length to be kept for downstream analysis.", - "fa_icon": "fas fa-ruler", - "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. 
A caveat, however, is that this will cause a very large increase in computational run time, because all reads in the library will be mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" - }, - "clip_min_read_quality": { - "type": "integer", - "default": 20, - "description": "Specify minimum base quality for trimming off bases.", - "fa_icon": "fas fa-medal", - "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" - }, - "min_adap_overlap": { - "type": "integer", - "default": 1, - "description": "Specify minimum adapter overlap required for clipping.", - "fa_icon": "fas fa-hands-helping", - "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" - }, - "skip_collapse": { - "type": "boolean", - "description": "Skips merging of forward and reverse reads together and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. 
This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" - }, - "skip_trim": { - "type": "boolean", - "description": "Skip adapter and quality trimming.", - "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" - }, - "preserve5p": { - "type": "boolean", - "description": "Skip quality base trimming (n, score, window) of 5 prime end.", - "fa_icon": "fas fa-life-ring", - "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used. Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" - }, - "mergedonly": { - "type": "boolean", - "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", - "fa_icon": "fas fa-handshake", - "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. 
reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." - }, - "qualitymax": { - "type": "integer", - "description": "Specify the maximum Phred score used in input FASTQ files", - "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", - "default": 41, - "fa_icon": "fas fa-arrow-up" - }, - "run_post_ar_trimming": { - "type": "boolean", - "description": "Turn on trimming of inline barcodes (i.e. 
internal barcodes after adapter removal)", - "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior to ligation of adapters and indices (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" - }, - "post_ar_trim_front": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of a merged read or R1", - "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" - }, - "post_ar_trim_tail": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of a merged read or R1", - "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" - }, - "post_ar_trim_front2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the front of R2", - "help_text": "Specify the number of bases to trim off the start of a read in an unmerged forward read (R1) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" - }, - "post_ar_trim_tail2": { - "type": "integer", - "default": 7, - "description": "Specify the number of bases to trim off the tail of R2", - "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" - } - }, - "fa_icon": "fas fa-cut", - "help_text": "These options handle 
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible skip one or the other." - }, - "mapping": { - "title": "Read mapping to reference genome", - "type": "object", - "description": "Options for reference-genome mapping", - "default": "", - "properties": { - "mapper": { - "title": "Mapper", - "type": "string", - "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", - "default": "bwaaln", - "fa_icon": "fas fa-layer-group", - "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). 
Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", - "enum": ["bwaaln", "bwamem", "circularmapper", "bowtie2"] - }, - "bwaalnn": { - "type": "number", - "default": 0.01, - "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.01` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" - }, - "bwaalnk": { - "type": "integer", - "default": 2, - "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", - "fa_icon": "fas fa-drafting-compass", - "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" - }, - "bwaalnl": { - "type": "integer", - "default": 1024, - "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al. 
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" - }, - "bwaalno": { - "type": "integer", - "default": 2, - "fa_icon": "fas fa-people-arrows", - "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", - "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" - }, - "circularextension": { - "type": "integer", - "default": 500, - "description": "Specify the number of bases to extend reference by (circularmapper only).", - "fa_icon": "fas fa-external-link-alt", - "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" - }, - "circulartarget": { - "type": "string", - "default": "MT", - "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", - "fa_icon": "fas fa-bullseye", - "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" - }, - "circularfilter": { - "type": "boolean", - "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", - "fa_icon": "fas fa-filter", - "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" - }, - "bt2_alignmode": { - "type": "string", - "default": "local", - "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": ["local", "end-to-end"] - }, - "bt2_sensitivity": { - "type": "string", - "default": "sensitive", - "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", - "fa_icon": "fas fa-microscope", - "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", - "enum": [ - "no-preset", - "very-fast", - "fast", - "sensitive", - "very-sensitive" - ] - }, - "bt2n": { - "type": "integer", - "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-sort-numeric-down", - "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use `--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", - "default": 0 - }, - "bt2l": { - "type": "integer", - "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use `--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)).\n\n> Modifies Bowtie2 parameters: `-L`", - "default": 0 - }, - "bt2_trim5": { - "type": "integer", - "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 5' (left) end of read prior to alignment. May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0\n\n> Modifies Bowtie2 parameters: `--trim5`", - "default": 0 - }, - "bt2_trim3": { - "type": "integer", - "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", - "fa_icon": "fas fa-cut", - "help_text": "Number of bases to trim at the 3' (right) end of read prior to alignment. 
May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0.\n\n> Modifies Bowtie2 parameters: `--trim3`", - "default": 0 - }, - "bt2_maxins": { - "type": "integer", - "default": 500, - "fa_icon": "fas fa-exchange-alt", - "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", - "help_text": "The maximum fragment length for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" - } - }, - "fa_icon": "fas fa-layer-group", - "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" - }, - "host_removal": { - "title": "Removal of Host-Mapped Reads", - "type": "object", - "description": "Options for production of host-read removed FASTQ files for privacy reasons.", - "default": "", - "properties": { - "hostremoval_input_fastq": { - "type": "boolean", - "description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", - "fa_icon": "fas fa-power-off", - "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" - }, - "hostremoval_mode": { - "type": "string", - "default": "remove", - "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", - "fa_icon": "fas fa-mask", - "help_text": "Read removal mode. 
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", - "enum": ["strip", "replace", "remove"] - } - }, - "fa_icon": "fas fa-user-shield", - "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file, are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows you to provide other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed library, i.e. after lane merging." - }, - "bam_filtering": { - "title": "BAM Filtering", - "type": "object", - "description": "Options for quality filtering and how to deal with off-target unmapped reads.", - "default": "", - "properties": { - "run_bam_filtering": { - "type": "boolean", - "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" - }, - "bam_mapping_quality_threshold": { - "type": "integer", - "description": "Minimum mapping quality for reads filter.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. 
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", - "default": 0 - }, - "bam_filter_minreadlength": { - "type": "integer", - "fa_icon": "fas fa-ruler-horizontal", - "description": "Specify minimum read length to be kept after mapping.", - "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", - "default": 0 - }, - "bam_unmapped_type": { - "type": "string", - "default": "discard", - "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format Options: 'discard', 'bam', 'fastq', 'both'.", - "fa_icon": "fas fa-trash-alt", - "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", - "enum": ["discard", "keep", "bam", "fastq", "both"] - } - }, - "fa_icon": "fas fa-sort-amount-down", - "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" - }, - "deduplication": { - "title": "DeDuplication", - "type": "object", - "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", - "default": "", - "properties": { - "dedupper": { - "type": "string", - "default": "markduplicates", - "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", - "fa_icon": "fas fa-object-group", - "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", - "enum": ["markduplicates", "dedup"] - }, - "dedup_all_merged": { - "type": "boolean", - "description": "Turn on treating all reads as merged reads.", - "fa_icon": "fas fa-handshake", - "help_text": "Sets DeDup to treat all reads as merged reads. 
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" - } - }, - "fa_icon": "fas fa-clone", - "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." - }, - "library_complexity_analysis": { - "title": "Library Complexity Analysis", - "type": "object", - "description": "Options for calculating library complexity (i.e. how many unique reads are present).", - "default": "", - "properties": { - "preseq_mode": { - "type": "string", - "default": "c_curve", - "description": "Specify which mode of preseq to run.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", - "enum": ["c_curve", "lc_extrap"] - }, - "preseq_step_size": { - "type": "integer", - "default": 1000, - "description": "Specify the step size of Preseq.", - "fa_icon": "fas fa-shoe-prints", - "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" - }, - "preseq_maxextrap": { - "type": "integer", - "default": 10000000000, - "description": "Specify the maximum extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-ban", - "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" - }, - "preseq_terms": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" - }, - "preseq_bootstrap": { - "type": "integer", - "default": 100, - "description": "Specify number of bootstraps to perform (lc_extrap mode only)", - "fa_icon": "fab fa-bootstrap", - "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" - }, - "preseq_cval": { - "type": "number", - "default": 0.95, - "description": "Specify confidence interval level (lc_extrap mode only)", - "fa_icon": "fas fa-check-circle", - "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" - } - }, - "fa_icon": "fas fa-bezier-curve", - "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
- }, - "adna_damage_analysis": { - "title": "(aDNA) Damage Analysis", - "type": "object", - "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", - "default": "", - "properties": { - "damageprofiler_length": { - "type": "integer", - "default": 100, - "description": "Specify length filter for DamageProfiler.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" - }, - "damageprofiler_threshold": { - "type": "integer", - "default": 15, - "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", - "fa_icon": "fas fa-ruler-horizontal", - "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" - }, - "damageprofiler_yaxis": { - "type": "number", - "default": 0.3, - "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'.", - "fa_icon": "fas fa-ruler-vertical", - "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. 
Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" - }, - "run_pmdtools": { - "type": "boolean", - "description": "Turn on PMDtools", - "fa_icon": "fas fa-power-off", - "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" - }, - "pmdtools_range": { - "type": "integer", - "default": 10, - "description": "Specify range of bases for PMDTools to scan for damage.", - "fa_icon": "fas fa-arrows-alt-h", - "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" - }, - "pmdtools_threshold": { - "type": "integer", - "default": 3, - "description": "Specify PMDScore threshold for PMDTools.", - "fa_icon": "fas fa-chart-bar", - "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" - }, - "pmdtools_reference_mask": { - "type": "string", - "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", - "fa_icon": "fas fa-mask", - "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." - }, - "pmdtools_max_reads": { - "type": "integer", - "default": 10000, - "description": "Specify the maximum number of reads to consider for metrics generation.", - "fa_icon": "fas fa-greater-than-equal", - "help_text": "The maximum number of reads used for damage assessment in PMDtools. 
Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" - }, - "pmdtools_platypus": { - "type": "boolean", - "description": "Append big list of base frequencies for platypus to output.", - "fa_icon": "fas fa-power-off", - "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. By default turned off.\n" - }, - "run_mapdamage_rescaling": { - "type": "boolean", - "fa_icon": "fas fa-map", - "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probablistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" - }, - "rescale_length_5p": { - "type": "integer", - "default": 12, - "fa_icon": "fas fa-balance-scale-right", - "description": "Length of read for mapDamage2 to rescale from 5p end.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." 
- }, - "rescale_length_3p": { - "type": "integer", - "default": 12, - "fa_icon": "fas fa-balance-scale-left", - "description": "Length of read for mapDamage2 to rescale from 3p end.", - "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." - } - }, - "fa_icon": "fas fa-chart-line", - "help_text": "More documentation can be seen in the follow links for:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming is run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" - }, - "feature_annotation_statistics": { - "title": "Feature Annotation Statistics", - "type": "object", - "description": "Options for getting reference annotation statistics (e.g. gene coverages)", - "default": "", - "properties": { - "run_bedtools_coverage": { - "type": "boolean", - "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", - "fa_icon": "fas fa-chart-area", - "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" - }, - "anno_file": { - "type": "string", - "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", - "fa_icon": "fas fa-file-signature", - "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). 
Must be in quotes.\n" - } - }, - "fa_icon": "fas fa-scroll", - "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" - }, - "bam_trimming": { - "title": "BAM Trimming", - "type": "object", - "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", - "default": "", - "properties": { - "run_trim_bam": { - "type": "boolean", - "description": "Turn on BAM trimming. Will only run on non-UDG or half-UDG libraries", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." 
- }, - "bamutils_clip_double_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_double_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_half_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_left": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler-combined", - "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_clip_single_stranded_none_udg_right": { - "type": "integer", - "default": 0, - "fa_icon": "fas fa-ruler", - "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", - "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" - }, - "bamutils_softclip": { - "type": "boolean", - "description": "Turn on using softclip instead of hard masking.", - "fa_icon": "fas fa-paint-roller", - "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. 
Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" - } - }, - "fa_icon": "fas fa-eraser", - "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" - }, - "genotyping": { - "title": "Genotyping", - "type": "object", - "description": "Options for variant calling.", - "default": "", - "properties": { - "run_genotyping": { - "type": "boolean", - "description": "Turn on genotyping of BAM files.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." - }, - "genotyping_tool": { - "type": "string", - "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", - "fa_icon": "fas fa-tools", - "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. 
Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> > Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", - "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"] - }, - "genotyping_source": { - "type": "string", - "default": "raw", - "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", - "fa_icon": "fas fa-faucet", - "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", - "enum": ["raw", "pmd", "trimmed", "rescaled"] - }, - "gatk_call_conf": { - "type": "integer", - "default": 30, - "description": "Specify GATK phred-scaled confidence threshold.", - "fa_icon": "fas fa-balance-scale-right", - "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" - }, - "gatk_ploidy": { - "type": "integer", - "default": 2, - "description": "Specify GATK organism ploidy.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms. 
Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" - }, - "gatk_downsample": { - "type": "integer", - "default": 250, - "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", - "fa_icon": "fas fa-icicles", - "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" - }, - "gatk_dbsnp": { - "type": "string", - "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", - "fa_icon": "fas fa-marker", - "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.\n" - }, - "gatk_hc_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", - "enum": [ - "EMIT_ALL_ACTIVE_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_hc_emitrefconf": { - "type": "string", - "default": "GVCF", - "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. 
Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", - "enum": ["NONE", "GVCF", "BP_RESOLUTION"] - }, - "gatk_ug_out_mode": { - "type": "string", - "default": "EMIT_VARIANTS_ONLY", - "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", - "fa_icon": "fas fa-bullhorn", - "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", - "enum": [ - "EMIT_ALL_SITES", - "EMIT_ALL_CONFIDENT_SITES", - "EMIT_VARIANTS_ONLY" - ] - }, - "gatk_ug_genotype_model": { - "type": "string", - "default": "SNP", - "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL`'. 
Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", - "enum": [ - "SNP", - "INDEL", - "BOTH", - "GENERALPLOIDYSNP", - "GENERALPLOIDYINDEL" - ] - }, - "gatk_ug_keep_realign_bam": { - "type": "boolean", - "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "fa_icon": "fas fa-align-left", - "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." - }, - "gatk_ug_defaultbasequalities": { - "type": "string", - "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", - "fa_icon": "fas fa-undo-alt", - "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" - }, - "freebayes_C": { - "type": "integer", - "default": 1, - "description": "Specify minimum required supporting observations to consider a variant.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify minimum required supporting observations to consider a variant. 
Default: `1`\n\n> Modifies freebayes parameter: `-C`" - }, - "freebayes_g": { - "type": "integer", - "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", - "fa_icon": "fab fa-think-peaks", - "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", - "default": 0 - }, - "freebayes_p": { - "type": "integer", - "default": 2, - "description": "Specify ploidy of sample in FreeBayes.", - "fa_icon": "fas fa-pastafarianism", - "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" - }, - "pileupcaller_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for pileupCaller.", - "fa_icon": "fas fa-bed", - "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" - }, - "pileupcaller_snpfile": { - "type": "string", - "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", - "fa_icon": "fas fa-sliders-h", - "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" - }, - "pileupcaller_method": { - "type": "string", - "default": "randomHaploid", - "description": "Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", - "fa_icon": "fas fa-toolbox", - "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. 
Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", - "enum": ["randomHaploid", "randomDiploid", "majorityCall"] - }, - "pileupcaller_transitions_mode": { - "type": "string", - "default": "AllSites", - "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", - "fa_icon": "fas fa-toggle-on", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", - "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"] - }, - "pileupcaller_min_map_quality": { - "type": "integer", - "default": 30, - "description": "The minimum mapping quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." - }, - "pileupcaller_min_base_quality": { - "type": "integer", - "default": 30, - "description": "The minimum base quality to be used for genotyping.", - "fa_icon": "fas fa-filter", - "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." - }, - "angsd_glmodel": { - "type": "string", - "default": "samtools", - "description": "Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", - "fa_icon": "fas fa-project-diagram", - "help_text": "Specify which genotype likelihood model to use. Options: `'samtools`, `'gatk'`, `'soapsnp'`, `'syk'`. 
Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", - "enum": ["samtools", "gatk", "soapsnp", "syk"] - }, - "angsd_glformat": { - "type": "string", - "default": "binary", - "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", - "fa_icon": "fas fa-text-height", - "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle'`. Default: `'binary'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: text output of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlF`", - "enum": ["text", "binary", "binary_three", "beagle"] - }, - "angsd_createfasta": { - "type": "boolean", - "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", - "fa_icon": "fas fa-align-justify", - "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" - }, - "angsd_fastamethod": { - "type": "string", - "default": "random", - "description": "Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", - "fa_icon": "fas fa-toolbox", - "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. `'common'` will output the most common non-N base at each given position, whereas `'random'` will pick one at random. 
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", - "enum": ["random", "common"] - }, - "run_bcftools_stats": { - "type": "boolean", - "default": true, - "description": "Turn on bcftools stats generation for VCF based variant calling statistics", - "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", - "fa_icon": "far fa-chart-bar" - } - }, - "fa_icon": "fas fa-sliders-h", - "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." - }, - "consensus_sequence_generation": { - "title": "Consensus Sequence Generation", - "type": "object", - "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", - "default": "", - "properties": { - "run_vcf2genome": { - "type": "boolean", - "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP'` flags. Typically useful for small genomes such as mitochondria.\n" - }, - "vcf2genome_outfile": { - "type": "string", - "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.fasta` in the file name.", - "fa_icon": "fas fa-file-alt", - "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" - }, - "vcf2genome_header": { - "type": "string", - "description": "Specify the header name of the consensus sequence entry within the FASTA file.", - "fa_icon": "fas fa-heading", - "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" - }, - "vcf2genome_minc": { - "type": "integer", - "default": 5, - "description": "Minimum depth coverage required for a call to be included (else N will be called).", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "Minimum depth coverage for a SNP to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" - }, - "vcf2genome_minq": { - "type": "integer", - "default": 30, - "description": "Minimum genotyping quality of a call to be made. Else N will be called.", - "fa_icon": "fas fa-medal", - "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" - }, - "vcf2genome_minfreq": { - "type": "number", - "default": 0.8, - "description": "Minimum fraction of reads supporting a call to be included. 
Else N will be called.", - "fa_icon": "fas fa-percent", - "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/eager/master/nextflow_schema.json", + "title": "nf-core/eager pipeline parameters", + "description": "A fully reproducible and state-of-the-art ancient DNA analysis pipeline", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data, and additional metadata.", + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string", + "description": "Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). For paired end data, the path must use '{1,2}' notation to specify read pairs. Alternatively, a path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template.", + "fa_icon": "fas fa-dna", + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager. The most efficient but more simplistic is supplying direct paths (with wildcards) to your FASTQ or BAM files, with each file or pair being considered a single library and each one run independently (e.g. for paired-end data: `--input '///*_{R1,R2}_*.fq.gz'`). TSV input requires creation of an extra file by the user (`--input '///eager_data.tsv'`) and extra metadata, but allows more powerful lane and library merging. Please see [usage docs](https://nf-co.re/eager/docs/usage#input-specifications) for detailed instructions and specifications." 
+ }, + "udg_type": { + "type": "string", + "default": "none", + "description": "Specifies whether you have UDG treated libraries. Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input.", + "fa_icon": "fas fa-vial", + "help_text": "Defines whether Uracil-DNA glycosylase (UDG) treatment was used to remove DNA\ndamage on the sequencing libraries.\n\nSpecify `'none'` if no treatment was performed. If you have partial UDG treated\ndata ([Rohland et al 2016](http://dx.doi.org/10.1098/rstb.2013.0624)), specify\n`'half'`. If you have complete UDG treated data ([Briggs et al.\n2010](https://doi.org/10.1093/nar/gkp1163)), specify `'full'`. \n\nWhen also using PMDtools specifying `'half'` will use a different model for DNA\ndamage assessment in PMDTools (PMDtools: `--UDGhalf`). Specify `'full'` and the\nPMDtools DNA damage assessment will use CpG context only (PMDtools: `--CpG`).\nDefault: `'none'`.\n\n> **Tip**: You should provide a small decoy reference genome with pre-made indices, e.g.\n> the human mtDNA genome, for the mandatory parameter `--fasta` in order to\n> avoid long computational time for generating the index files of the reference\n> genome, even if you do not actually need a reference genome for any downstream\n> analyses.", + "enum": [ + "none", + "half", + "full" + ] + }, + "single_stranded": { + "type": "boolean", + "description": "Specifies that libraries are single stranded. Always affects MALTExtract but will be ignored by pileupCaller with TSV input. Not required for TSV input.", + "fa_icon": "fas fa-minus", + "help_text": "Indicates libraries are single stranded.\n\nCurrently only affects MALTExtract where it will switch on damage patterns\ncalculation mode to single-stranded, (MaltExtract: `--singleStranded`) and\ngenotyping with pileupCaller where a different method is used (pileupCaller:\n`--singleStrandMode`). 
Default: false\n\nOnly required when using the 'Path' method of `--input`" + }, + "single_end": { + "type": "boolean", + "description": "Specifies that the input is single end reads. Not required for TSV input.", + "fa_icon": "fas fa-align-left", + "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, specify this parameter on the command line when you launch the pipeline. It is not possible to run a mixture of single-end and paired-end files in one run.\n\nOnly required when using the 'Path' method of `--input`" + }, + "colour_chemistry": { + "type": "integer", + "default": 4, + "description": "Specifies which Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. Options: 2, 4.", + "fa_icon": "fas fa-palette", + "help_text": "Specifies which Illumina colour chemistry a library was sequenced with. This informs whether to perform poly-G trimming (if `--complexity_filter_poly_g` is also supplied). Only 2 colour chemistry sequencers (e.g. NextSeq or NovaSeq) can generate uncertain poly-G tails (due to 'G' being indicated via a no-colour detection). Default is '4' to indicate e.g. HiSeq or MiSeq platforms, which do not require poly-G trimming. Options: 2, 4. Default: 4\n\nOnly required when using the 'Path' method of input." + }, + "bam": { + "type": "boolean", + "description": "Specifies that the input is in BAM format. Not required for TSV input.", + "fa_icon": "fas fa-align-justify", + "help_text": "Specifies the input file type to `--input` is in BAM format. 
This will automatically also apply `--single_end`.\n\nOnly required when using the 'Path' method of `--input`.\n" + } + }, + "help_text": "There are two possible ways of supplying input sequencing data to nf-core/eager.\nThe most efficient but more simplistic is supplying direct paths (with\nwildcards) to your FASTQ or BAM files, with each file or pair being considered a\nsingle library and each one run independently. TSV input requires creation of an\nextra file by the user and extra metadata, but allows more powerful lane and\nlibrary merging." + }, + "input_data_additional_options": { + "title": "Input Data Additional Options", + "type": "object", + "description": "Additional options regarding input data.", + "default": "", + "properties": { + "snpcapture_bed": { + "type": "string", + "fa_icon": "fas fa-magnet", + "description": "If library result of SNP capture, path to BED file containing SNPS positions on reference genome.", + "help_text": "Can be used to set a path to a BED file (3/6 column format) of SNP positions of a reference genome, to calculate SNP captured libraries on-target efficiency. This should be used for array or in-solution SNP capture protocols such as 390K, 1240K, etc. If supplied, on-target metrics are automatically generated for you by qualimap." + }, + "run_convertinputbam": { + "type": "boolean", + "description": "Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.).", + "fa_icon": "fas fa-undo-alt", + "help_text": "Allows you to convert an input BAM file back to FASTQ for downstream processing. Note this is required if you need to perform AdapterRemoval and/or polyG clipping.\n\nIf not turned on, BAMs will automatically be sent to post-mapping steps." 
+ } + }, + "fa_icon": "far fa-plus-square" + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "properties": { + "fasta": { + "type": "string", + "fa_icon": "fas fa-font", + "description": "Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta'.", + "help_text": "You specify the full path to your reference genome here. The FASTA file can have any file suffix, such as `.fasta`, `.fna`, `.fa`, `.FastA` etc. You may also supply a gzipped reference files, which will be unzipped automatically for you.\n\nFor example:\n\n```bash\n--fasta '///my_reference.fasta'\n```\n\n> If you don't specify appropriate `--bwa_index`, `--fasta_index` parameters, the pipeline will create these indices for you automatically. Note that you can save the indices created for you for later by giving the `--save_reference` flag.\n> You must select either a `--fasta` or `--genome`\n" + }, + "genome": { + "type": "string", + "description": "Name of iGenomes reference (required if not FASTA reference). Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", + "fa_icon": "fas fa-book", + "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. 
cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n**NB** Requires argument `--igenomes_ignore false` as iGenomes ignored by default in nf-core/eager\n\n```" + }, + "igenomes_base": { + "type": "string", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + }, + "bwa_index": { + "type": "string", + "description": "Path to directory containing pre-made BWA indices (i.e. the directory before the files ending in '.amb' '.ann' '.bwt'. Do not include the files themselves. Most likely the same directory of the file provided with --fasta). 
If not supplied will be made for you.", + "fa_icon": "fas fa-address-book", + "help_text": "If you want to use pre-existing `bwa index` indices, please supply the **directory** to the FASTA you also specified in `--fasta` nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bwa` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bwa_index/BWAIndex/'\n```\n\n> `bwa index` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." + }, + "bt2_index": { + "type": "string", + "description": "Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you.", + "fa_icon": "far fa-address-book", + "help_text": "If you want to use pre-existing `bt2 index` indices, please supply the **directory** to the FASTA you also specified in `--fasta`. nf-core/eager will automagically detect the index files by searching for the FASTA filename with the corresponding `bt2` index file suffixes.\n\nFor example:\n\n```bash\nnextflow run nf-core/eager \\\n-profile test,docker \\\n--input '*{R1,R2}*.fq.gz'\n--fasta 'results/reference_genome/bwa_index/BWAIndex/Mammoth_MT_Krause.fasta' \\\n--bwa_index 'results/reference_genome/bt2_index/BT2Index/'\n```\n\n> `bowtie2-build` does not give you an option to supply alternative suffixes/names for these indices. Thus, the file names generated by this command _must not_ be changed, otherwise nf-core/eager will not be able to find them." 
+ }, + "fasta_index": { + "type": "string", + "description": "Path to samtools FASTA index (typically ending in '.fai'). If not supplied will be made for you.", + "fa_icon": "far fa-bookmark", + "help_text": "If you want to use a pre-existing `samtools faidx` index, use this to specify the required FASTA index file for the selected reference genome. This should be generated by `samtools faidx` and has a file suffix of `.fai`\n\nFor example:\n\n```bash\n--fasta_index 'Mammoth_MT_Krause.fasta.fai'\n```" + }, + "seq_dict": { + "type": "string", + "description": "Path to picard sequence dictionary file (typically ending in '.dict'). If not supplied will be made for you.", + "fa_icon": "fas fa-spell-check", + "help_text": "If you want to use a pre-existing `picard CreateSequenceDictionary` dictionary file, use this to specify the required `.dict` file for the selected reference genome.\n\nFor example:\n\n```bash\n--seq_dict 'Mammoth_MT_Krause.dict'\n```" + }, + "large_ref": { + "type": "boolean", + "description": "Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'.", + "fa_icon": "fas fa-mountain", + "help_text": "This parameter is required to be set for large reference genomes. If your\nreference genome is larger than 3.5GB, the `samtools index` calls in the\npipeline need to generate `CSI` indices instead of `BAI` indices to compensate\nfor the size of the reference genome (with samtools: `-c`). This parameter is\nnot required for smaller references (including the human `hg19` or\n`grch37`/`grch38` references), but `>4GB` genomes have been shown to need `CSI`\nindices. 
Default: off" + }, + "save_reference": { + "type": "boolean", + "description": "If not already supplied by user, turns on saving of generated reference genome indices for later re-usage.", + "fa_icon": "far fa-save", + "help_text": "Use this if you do not have pre-made reference FASTA indices for `bwa`, `samtools` and `picard`. If you turn this on, the indices nf-core/eager generates for you and will be saved in the `/results/reference_genomes` for you. If not supplied, nf-core/eager generated index references will be deleted.\n\n> modifies SAMtools index command: `-c`" + } + }, + "description": "Specify locations of references and optionally, additional pre-made indices", + "help_text": "All nf-core/eager runs require a reference genome in FASTA format to map reads\nagainst to.\n\nIn addition we provide various options for indexing of different types of\nreference genomes (based on the tools used in the pipeline). nf-core/eager can\nindex reference genomes for you (with options to save these for other analysis),\nbut you can also supply your pre-made indices.\n\nSupplying pre-made indices saves time in pipeline execution and is especially\nadvised when running multiple times on the same cluster system for example. You\ncan even add a resource [specific profile](#profile) that sets paths to\npre-computed reference genomes, saving time when specifying these.\n\n> :warning: you must always supply a reference file. If you want to use\n functionality that does not require one, supply a small decoy genome such as\n phiX or the human mtDNA genome." 
+ }, + "output_options": { + "title": "Output options", + "type": "object", + "description": "Specify where to put output files and optional saving of intermediate files", + "default": "", + "properties": { + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open", + "help_text": "The output directory where the results will be saved. By default will be made in the directory you run the command in under `./results`." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "hidden": true, + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ] + } + }, + "fa_icon": "fas fa-cloud-download-alt" + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "hidden": true, + "fa_icon": "fas fa-question-circle" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "An email address to send a summary email to when the pipeline is completed.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, 
only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true, + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run if it **fails**. Normally would be the same as in `--email` but can be different. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.\n\n> Note that this functionality requires either `mail` or `sendmail` to be installed on your system." + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true, + "help_text": "Set to receive plain-text e-mails instead of HTML formatted." + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true, + "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true, + "help_text": "Set to disable colourful command line output and live life in monochrome." 
+ }, + "multiqc_config": { + "type": "string", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "tracedir": { + "type": "string", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", + "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "enable_conda": { + "type": "boolean", + "hidden": true, + "description": "Parameter used for checking conda channels to be set correctly." + }, + "schema_ignore_params": { + "type": "string", + "fa_icon": "fas fa-not-equal", + "description": "String to specify ignored parameters for parameter validation", + "hidden": true, + "default": "genomes" + } + }, + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`." + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These generally should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. 
You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog", + "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", + "fa_icon": "fas fa-users-cog" + }, + "hostnames": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "awsqueue": { + "type": "string", + "description": "The AWSBatch JobQueue that needs to be set when running on AWSBatch", + "fa_icon": "fab fa-aws" + }, + "awsregion": { + "type": "string", + "default": "eu-west-1", + "description": "The AWS Region for your AWS Batch job to run on", + "fa_icon": "fab fa-aws" + }, + "awscli": { + "type": "string", + "description": "Path to the AWS CLI tool", + "fa_icon": "fab fa-aws" + } + } + }, + "skip_steps": { + "title": "Skip steps", + "type": "object", + "description": "Skip any of the mentioned steps.", + "default": "", + "properties": { + "skip_fastqc": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + 
"help_text": "Turns off FastQC pre- and post-Adapter Removal, to speed up the pipeline. Use of this flag is most common when data has been previously pre-processed and the post-Adapter Removal mapped reads are being re-mapped to a new reference genome." + }, + "skip_adapterremoval": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter trimming and paired-end read merging. Equivalent to setting both `--skip_collapse` and `--skip_trim`." + }, + "skip_preseq": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the computation of library complexity estimation." + }, + "skip_deduplication": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off duplicate removal methods DeDup and MarkDuplicates respectively. No duplicates will be removed on any data in the pipeline.\n" + }, + "skip_damage_calculation": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the DamageProfiler module to compute DNA damage profiles.\n" + }, + "skip_qualimap": { + "type": "boolean", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off QualiMap and thus does not compute coverage and other mapping metrics.\n" + } + }, + "fa_icon": "fas fa-fast-forward", + "help_text": "Some of the steps in the pipeline can be executed optionally. If you specify\nspecific steps to be skipped, there won't be any output related to these\nmodules." + }, + "complexity_filtering": { + "title": "Complexity filtering", + "type": "object", + "description": "Processing of Illumina two-colour chemistry data.", + "default": "", + "properties": { + "complexity_filter_poly_g": { + "type": "boolean", + "description": "Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries.", + "fa_icon": "fas fa-power-off", + "help_text": "Performs a poly-G tail removal step in the beginning of the pipeline using `fastp`, if turned on. 
This can be useful for trimming poly-G tails from short-fragments sequenced on two-colour Illumina chemistry such as NextSeqs (where no-fluorescence is read as a G on two-colour chemistry), which can inflate reported GC content values.\n" + }, + "complexity_filter_poly_g_min": { + "type": "integer", + "default": 10, + "description": "Specify length of poly-g min for clipping to be performed.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "This option can be used to define the minimum length of a poly-G tail to begin low complexity trimming. By default, this is set to a value of `10` unless the user has chosen something specifically using this option.\n\n> Modifies fastp parameter: `--poly_g_min_len`" + } + }, + "fa_icon": "fas fa-filter", + "help_text": "More details can be seen in the [fastp\ndocumentation](https://github.com/OpenGene/fastp)\n\nIf using TSV input, this is performed per lane separately" + }, + "read_merging_and_adapter_removal": { + "title": "Read merging and adapter removal", + "type": "object", + "description": "Options for adapter clipping and paired-end merging.", + "default": "", + "properties": { + "clip_forward_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC", + "description": "Specify adapter sequence to be clipped off (forward strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the forward read. By default, this is set to `'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'`.\n\n> Modifies AdapterRemoval parameter: `--adapter1`" + }, + "clip_reverse_adaptor": { + "type": "string", + "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA", + "description": "Specify adapter sequence to be clipped off (reverse strand).", + "fa_icon": "fas fa-cut", + "help_text": "Defines the adapter sequence to be used for the reverse read in paired end sequencing projects. 
This is set to `'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA'` by default.\n\n> Modifies AdapterRemoval parameter: `--adapter2`" + }, + "clip_adapters_list": { + "type": "string", + "description": "Path to AdapterRemoval adapter list file. Overrides `--clip_*_adaptor` parameters", + "fa_icon": "fas fa-cut", + "help_text": "Allows you to supply a file with a list of adapter (combinations) to remove from all files. **Overrides** the `--clip_*_adaptor` parameters. First column represents forward strand, second column for reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See [AdapterRemoval documentation](https://adapterremoval.readthedocs.io/en/latest/manpage.html) for more information.\n\n> Modifies AdapterRemoval parameter: `--adapter-list`" + }, + "clip_readlength": { + "type": "integer", + "default": 30, + "description": "Specify read minimum length to be kept for downstream analysis.", + "fa_icon": "fas fa-ruler", + "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - that performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator. \n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. 
A caveat, however, is that this will cause a very large increase in computational run time, due to all reads in the library being mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n" + }, + "clip_min_read_quality": { + "type": "integer", + "default": 20, + "description": "Specify minimum base quality for trimming off bases.", + "fa_icon": "fas fa-medal", + "help_text": "Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.\n\n> Modifies AdapterRemoval parameter: `--minquality`" + }, + "min_adap_overlap": { + "type": "integer", + "default": 1, + "description": "Specify minimum adapter overlap required for clipping.", + "fa_icon": "fas fa-hands-helping", + "help_text": "Specifies a minimum number of bases that overlap with the adapter sequence before adapters are trimmed from reads. Default is set to `1` base overlap.\n\n> Modifies AdapterRemoval parameter: `--minadapteroverlap`" + }, + "skip_collapse": { + "type": "boolean", + "description": "Skip merging of forward and reverse reads together and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will then also cause downstream mapping steps to run in paired-end mode. 
This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" + }, + "skip_trim": { + "type": "boolean", + "description": "Skip adapter and quality trimming.", + "fa_icon": "fas fa-fast-forward", + "help_text": "Turns off adapter AND quality trimming.\n\nFor example:\n\n```bash\n--skip_trim --input '*.fastq'\n```\n\n:warning: it is not possible to keep quality trimming (n or base quality) on,\n_and_ skip adapter trimming.\n\n:warning: it is not possible to turn off one or the other of quality\ntrimming or n trimming. i.e. --trimns --trimqualities are both given\nor neither. However setting quality in `--clip_min_read_quality` to 0 would\ntheoretically turn off base quality trimming.\n\n> Modifies AdapterRemoval parameters: `--trimns --trimqualities --adapter1 --adapter2`" + }, + "preserve5p": { + "type": "boolean", + "description": "Skip quality base trimming (n, score, window) of 5 prime end.", + "fa_icon": "fas fa-life-ring", + "help_text": "Turns off quality based trimming at the 5p end of reads when any of the --trimns, --trimqualities, or --trimwindows options are used. Only 3p end of reads will be removed.\n\nThis also entirely disables quality based trimming of collapsed reads, since both ends of these are informative for PCR duplicate filtering. Described [here](https://github.com/MikkelSchubert/adapterremoval/issues/32#issuecomment-504758137).\n\n> Modifies AdapterRemoval parameters: `--preserve5p`" + }, + "mergedonly": { + "type": "boolean", + "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", + "fa_icon": "fas fa-handshake", + "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. 
reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." + }, + "qualitymax": { + "type": "integer", + "description": "Specify the maximum Phred score used in input FASTQ files", + "help_text": "Specify maximum Phred score of the quality field of FASTQ files. The quality-score range can vary depending on the machine and version (e.g. see diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", + "default": 41, + "fa_icon": "fas fa-arrow-up" + }, + "run_post_ar_trimming": { + "type": "boolean", + "description": "Turn on trimming of inline barcodes (i.e. 
internal barcodes after adapter removal)", + "help_text": "In some cases, you may want to additionally trim reads in a FASTQ file after adapter removal.\n\nThis could be to remove short 'inline' or 'internal' barcodes that are ligated directly onto DNA molecules prior ligation of adapters and indicies (the former of which allow ultra-multiplexing and/or checks for barcode hopping).\n\nIn other cases, you may wish to already remove known high-frequency damage bases to allow stricter mapping.\n\nTurning on this module uses `fastp` to trim one, or both ends of a merged read, or in cases where you have not collapsed your read, R1 and R2.\n" + }, + "post_ar_trim_front": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of a merged read or R1", + "help_text": "Specify the number of bases to trim off the start of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_front1`" + }, + "post_ar_trim_tail": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of of a merged read or R1", + "help_text": "Specify the number of bases to trim off the end of a read in a merged- or forward read FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail1`" + }, + "post_ar_trim_front2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the front of R2", + "help_text": "Specify the number of bases to trim off the start of a read in an unmerged forward read (R1) FASTQ file.\n\n> Modifies fastp parameters: `--trim_front2`" + }, + "post_ar_trim_tail2": { + "type": "integer", + "default": 7, + "description": "Specify the number of bases to trim off the tail of R2", + "help_text": "Specify the number of bases to trim off the end of a read in an unmerged reverse read (R2) FASTQ file.\n\n> Modifies fastp parameters: `--trim_tail2`" + } + }, + "fa_icon": "fas fa-cut", + "help_text": "These options handle 
various parts of adapter clipping and read merging steps.\n\nMore details can be seen in the [AdapterRemoval\ndocumentation](https://adapterremoval.readthedocs.io/en/latest/)\n\nIf using TSV input, this is performed per lane separately.\n\n> :warning: `--skip_trim` will skip adapter clipping AND quality trimming\n> (n, base quality). It is currently not possible to skip one or the other." + }, + "mapping": { + "title": "Read mapping to reference genome", + "type": "object", + "description": "Options for reference-genome mapping", + "default": "", + "properties": { + "mapper": { + "title": "Mapper", + "type": "string", + "description": "Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'.", + "default": "bwaaln", + "fa_icon": "fas fa-layer-group", + "help_text": "Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). BWA aln is the default and highly suited for short-read ancient DNA. BWA mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing an extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to BWA aln, and has recently been suggested to provide slightly better results under certain conditions ([Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105)), as well as providing extra functionality (such as FASTQ trimming). 
Default is 'bwaaln'\n\nMore documentation can be seen for each tool under:\n\n- [BWA aln](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [BWA mem](http://bio-bwa.sourceforge.net/bwa.shtml#3)\n- [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html)\n- [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line)\n", + "enum": [ + "bwaaln", + "bwamem", + "circularmapper", + "bowtie2" + ] + }, + "bwaalnn": { + "type": "number", + "default": 0.01, + "description": "Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in the alignment.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "Configures the `bwa aln -n` parameter, defining how many mismatches are allowed in a read. By default set to `0.01` (following recommendations of [Schubert et al. (2012 _BMC Genomics_)](https://doi.org/10.1186/1471-2164-13-178)), if you're uncertain what to set check out [this](https://apeltzer.shinyapps.io/bwa-mismatches/) Shiny App for more information on how to set this parameter efficiently.\n\n> Modifies bwa aln parameter: `-n`" + }, + "bwaalnk": { + "type": "integer", + "default": 2, + "description": "Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed.", + "fa_icon": "fas fa-drafting-compass", + "help_text": "Configures the `bwa aln -k` parameter for the seeding phase in the mapping algorithm. Default is set to `2`.\n\n> Modifies BWA aln parameter: `-k`" + }, + "bwaalnl": { + "type": "integer", + "default": 1024, + "description": "Specify the -l parameter for BWA aln i.e. the length of seeds to be used.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Configures the length of the seed used in `bwa aln -l`. Default is set to be 'turned off' at the recommendation of Schubert et al. 
([2012 _BMC Genomics_](https://doi.org/10.1186/1471-2164-13-178)) for ancient DNA with `1024`.\n\nNote: Despite being recommended, turning off seeding can result in long runtimes!\n\n> Modifies BWA aln parameter: `-l`\n" + }, + "bwaalno": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-people-arrows", + "description": "Specify the -o parameter for BWA aln i.e. the number of gaps allowed.", + "help_text": "Configures the number of gaps used in `bwa aln`. Default is set to `bwa` default.\n\n> Modifies BWA aln parameter: `-o`\n" + }, + "circularextension": { + "type": "integer", + "default": 500, + "description": "Specify the number of bases to extend reference by (circularmapper only).", + "fa_icon": "fas fa-external-link-alt", + "help_text": "The number of bases to extend the reference genome with. By default this is set to `500` if not specified otherwise.\n\n> Modifies circulargenerator and realignsamfile parameter: `-e`" + }, + "circulartarget": { + "type": "string", + "default": "MT", + "description": "Specify the FASTA header of the target chromosome to extend (circularmapper only).", + "fa_icon": "fas fa-bullseye", + "help_text": "The chromosome in your FASTA reference that you'd like to be treated as circular. By default this is set to `MT` but can be configured to match any other chromosome.\n\n> Modifies circulargenerator parameter: `-s`" + }, + "circularfilter": { + "type": "boolean", + "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", + "fa_icon": "fas fa-filter", + "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" + }, + "bt2_alignmode": { + "type": "string", + "default": "local", + "description": "Specify the bowtie2 alignment mode. 
Options: 'local', 'end-to-end'.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [Poullet and Orlando 2020](https://doi.org/10.3389/fevo.2020.00105).\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": [ + "local", + "end-to-end" + ] + }, + "bt2_sensitivity": { + "type": "string", + "default": "sensitive", + "description": "Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'.", + "fa_icon": "fas fa-microscope", + "help_text": "The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment)\n\n> Modifies Bowtie2 parameters: `--very-fast --fast --sensitive --very-sensitive --very-fast-local --fast-local --sensitive-local --very-sensitive-local`", + "enum": [ + "no-preset", + "very-fast", + "fast", + "sensitive", + "very-sensitive" + ] + }, + "bt2n": { + "type": "integer", + "description": "Specify the -N parameter for bowtie2 (mismatches in seed). 
This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-sort-numeric-down", + "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", + "default": 0 + }, + "bt2l": { + "type": "integer", + "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`", + "default": 0 + }, + "bt2_trim5": { + "type": "integer", + "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`", + "default": 0 + }, + "bt2_trim3": { + "type": "integer", + "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.", + "fa_icon": "fas fa-cut", + "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. 
May be useful when left-over sequencing artefacts of in-line barcodes are present. Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`", + "default": 0 + }, + "bt2_maxins": { + "type": "integer", + "default": 500, + "fa_icon": "fas fa-exchange-alt", + "description": "Specify the maximum fragment length for Bowtie2 paired-end mapping mode only.", + "help_text": "The maximum fragment for valid paired-end alignments. Only for paired-end mapping (i.e. unmerged), and therefore typically only useful for modern data.\n\n See [Bowtie2 documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) for more information.\n\n> Modifies Bowtie2 parameters: `--maxins`" + } + }, + "fa_icon": "fas fa-layer-group", + "help_text": "If using TSV input, mapping is performed at the library level, i.e. after lane merging.\n" + }, + "host_removal": { + "title": "Removal of Host-Mapped Reads", + "type": "object", + "description": "Options for production of host-read removed FASTQ files for privacy reasons.", + "default": "", + "properties": { + "hostremoval_input_fastq": { + "type": "boolean", + "description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)", + "fa_icon": "fas fa-power-off", + "help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n" + }, + "hostremoval_mode": { + "type": "string", + "default": "remove", + "description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).", + "fa_icon": "fas fa-mask", + "help_text": "Read removal mode. 
Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", + "enum": [ + "strip", + "replace", + "remove" + ] + } + }, + "fa_icon": "fas fa-user-shield", + "help_text": "These parameters are used for removing mapped reads from the original input\nFASTQ files, usually in the context of uploading the original FASTQ files to a\npublic read archive (NCBI SRA/EBI ENA/DDBJ SRA).\n\nThese flags will produce FASTQ files almost identical to your input files,\nexcept that reads with the same read ID as one found in the mapped bam file, are\neither removed or 'masked' (every base replaced with Ns).\n\nThis functionality allows you to provide other researchers who wish to re-use\nyour data to apply their own adapter removal/read merging procedures, while\nmaintaining anonymity for sample donors - for example with microbiome\nresearch.\n\nIf using TSV input, stripping is performed per library, i.e. after lane merging." + }, + "bam_filtering": { + "title": "BAM Filtering", + "type": "object", + "description": "Options for quality filtering and how to deal with off-target unmapped reads.", + "default": "", + "properties": { + "run_bam_filtering": { + "type": "boolean", + "description": "Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the bam filtering module for either mapping quality filtering or unmapped read treatment.\n" + }, + "bam_mapping_quality_threshold": { + "type": "integer", + "description": "Minimum mapping quality for reads filter.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. 
By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`", + "default": 0 + }, + "bam_filter_minreadlength": { + "type": "integer", + "fa_icon": "fas fa-ruler-horizontal", + "description": "Specify minimum read length to be kept after mapping.", + "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`", + "default": 0 + }, + "bam_unmapped_type": { + "type": "string", + "default": "discard", + "description": "Defines whether to discard all unmapped reads, keep only bam and/or keep only fastq format. Options: 'discard', 'keep', 'bam', 'fastq', 'both'.", + "fa_icon": "fas fa-trash-alt", + "help_text": "Defines how to proceed with unmapped reads: `'discard'` removes all unmapped reads, `keep` keeps both unmapped and mapped reads in the same BAM file, `'bam'` keeps unmapped reads as BAM file, `'fastq'` keeps unmapped reads as FastQ file, `both` keeps both BAM and FASTQ files. Default is `discard`. 
`keep` is what would happen if `--run_bam_filtering` was _not_ supplied.\n\nNote that in all cases, if `--bam_mapping_quality_threshold` is also supplied, mapping quality filtering will still occur on the mapped reads.\n\n> Modifies samtools view parameter: `-f4 -F4`", + "enum": [ + "discard", + "keep", + "bam", + "fastq", + "both" + ] + } + }, + "fa_icon": "fas fa-sort-amount-down", + "help_text": "Users can configure to keep/discard/extract certain groups of reads efficiently\nin the nf-core/eager pipeline.\n\nIf using TSV input, filtering is performed per library, i.e. after lane merging.\n\nThis module utilises `samtools view` and `filter_bam_fragment_length.py`" + }, + "deduplication": { + "title": "DeDuplication", + "type": "object", + "description": "Options for removal of PCR amplicon duplicates that can artificially inflate coverage.", + "default": "", + "properties": { + "dedupper": { + "type": "string", + "default": "markduplicates", + "description": "Deduplication method to use. Options: 'markduplicates', 'dedup'.", + "fa_icon": "fas fa-object-group", + "help_text": "Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool `dedup` ([Peltzer et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered.\n\nThis utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should generally only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data.\n", + "enum": [ + "markduplicates", + "dedup" + ] + }, + "dedup_all_merged": { + "type": "boolean", + "description": "Turn on treating all reads as merged reads.", + "fa_icon": "fas fa-handshake", + "help_text": "Sets DeDup to treat all reads as merged reads. 
This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above).\n\n> Modifies dedup parameter: `-m`" + } + }, + "fa_icon": "fas fa-clone", + "help_text": "If using TSV input, deduplication is performed per library, i.e. after lane merging." + }, + "library_complexity_analysis": { + "title": "Library Complexity Analysis", + "type": "object", + "description": "Options for calculating library complexity (i.e. how many unique reads are present).", + "default": "", + "properties": { + "preseq_mode": { + "type": "string", + "default": "c_curve", + "description": "Specify which mode of preseq to run.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify which mode of preseq to run.\n\nFrom the [PreSeq documentation](http://smithlabresearch.org/wp-content/uploads/manual.pdf): \n\n`c curve` is used to compute the expected complexity curve of a mapped read file with a hypergeometric\nformula\n\n`lc extrap` is used to generate the expected yield for theoretical larger experiments and bounds on the\nnumber of distinct reads in the library and the associated confidence intervals, which is computed by\nbootstrapping the observed duplicate counts histogram", + "enum": [ + "c_curve", + "lc_extrap" + ] + }, + "preseq_step_size": { + "type": "integer", + "default": 1000, + "description": "Specify the step size of Preseq.", + "fa_icon": "fas fa-shoe-prints", + "help_text": "Can be used to configure the step size of Preseq's `c_curve` and `lc_extrap` method. 
Can be useful when only few and thus shallow sequencing results are used for extrapolation.\n\n> Modifies preseq c_curve and lc_extrap parameter: `-s`" + }, + "preseq_maxextrap": { + "type": "integer", + "default": 10000000000, + "description": "Specify the maximum extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-ban", + "help_text": "Specify the maximum extrapolation that `lc_extrap` mode will perform.\n\n> Modifies preseq lc_extrap parameter: `-e`" + }, + "preseq_terms": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of terms for extrapolation (lc_extrap mode only)", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the maximum number of terms that `lc_extrap` mode will use.\n\n> Modifies preseq lc_extrap parameter: `-x`" + }, + "preseq_bootstrap": { + "type": "integer", + "default": 100, + "description": "Specify number of bootstraps to perform (lc_extrap mode only)", + "fa_icon": "fab fa-bootstrap", + "help_text": "Specify the number of bootstraps `lc_extrap` mode will perform to calculate confidence intervals.\n\n> Modifies preseq lc_extrap parameter: `-n`" + }, + "preseq_cval": { + "type": "number", + "default": 0.95, + "description": "Specify confidence interval level (lc_extrap mode only)", + "fa_icon": "fas fa-check-circle", + "help_text": "Specify the allowed level of confidence intervals used for `lc_extrap` mode.\n\n> Modifies preseq lc_extrap parameter: `-c`" + } + }, + "fa_icon": "fas fa-bezier-curve", + "help_text": "nf-core/eager uses Preseq on mapped reads as one method to calculate library\ncomplexity. If DeDup is used, Preseq uses the histogram output of DeDup,\notherwise the sorted non-duplicated BAM file is supplied. Furthermore, if\npaired-end read collapsing is not performed, the `-P` flag is used." 
+ }, + "adna_damage_analysis": { + "title": "(aDNA) Damage Analysis", + "type": "object", + "description": "Options for calculating and filtering for characteristic ancient DNA damage patterns.", + "default": "", + "properties": { + "damageprofiler_length": { + "type": "integer", + "default": 100, + "description": "Specify length filter for DamageProfiler.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Specifies the length filter for DamageProfiler. By default set to `100`.\n\n> Modifies DamageProfile parameter: `-l`" + }, + "damageprofiler_threshold": { + "type": "integer", + "default": 15, + "description": "Specify number of bases of each read to consider for DamageProfiler calculations.", + "fa_icon": "fas fa-ruler-horizontal", + "help_text": "Specifies the length of the read start and end to be considered for profile generation in DamageProfiler. By default set to `15` bases.\n\n> Modifies DamageProfile parameter: `-t`" + }, + "damageprofiler_yaxis": { + "type": "number", + "default": 0.3, + "description": "Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'.", + "fa_icon": "fas fa-ruler-vertical", + "help_text": "Specifies what the maximum misincorporation frequency should be displayed as, in the DamageProfiler damage plot. This is set to `0.30` (i.e. 30%) by default as this matches the popular [mapDamage2.0](https://ginolhac.github.io/mapDamage) program. However, the default behaviour of DamageProfiler is to 'autoscale' the y-axis maximum to zoom in on any _possible_ damage that may occur (e.g. if the damage is about 10%, the highest value on the y-axis would be set to 0.12). This 'autoscale' behaviour can be turned on by specifying the number to `0`. 
Default: `0.30`.\n\n> Modifies DamageProfile parameter: `-yaxis_damageplot`" + }, + "run_pmdtools": { + "type": "boolean", + "description": "Turn on PMDtools", + "fa_icon": "fas fa-power-off", + "help_text": "Specifies to run PMDTools for damage based read filtering and assessment of DNA damage in sequencing libraries. By default turned off.\n" + }, + "pmdtools_range": { + "type": "integer", + "default": 10, + "description": "Specify range of bases for PMDTools to scan for damage.", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "Specifies the range in which to consider DNA damage from the ends of reads. By default set to `10`.\n\n> Modifies PMDTools parameter: `--range`" + }, + "pmdtools_threshold": { + "type": "integer", + "default": 3, + "description": "Specify PMDScore threshold for PMDTools.", + "fa_icon": "fas fa-chart-bar", + "help_text": "Specifies the PMDScore threshold to use in the pipeline when filtering BAM files for DNA damage. Only reads which surpass this damage score are considered for downstream DNA analysis. By default set to `3` if not set specifically by the user.\n\n> Modifies PMDTools parameter: `--threshold`" + }, + "pmdtools_reference_mask": { + "type": "string", + "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", + "fa_icon": "fas fa-mask", + "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." + }, + "pmdtools_max_reads": { + "type": "integer", + "default": 10000, + "description": "Specify the maximum number of reads to consider for metrics generation.", + "fa_icon": "fas fa-greater-than-equal", + "help_text": "The maximum number of reads used for damage assessment in PMDtools. 
Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" + }, + "pmdtools_platypus": { + "type": "boolean", + "description": "Append big list of base frequencies for platypus to output.", + "fa_icon": "fas fa-power-off", + "help_text": "Enables the printing of a wider list of base frequencies used by platypus as an addition to the output base misincorporation frequency table. By default turned off.\n" + }, + "run_mapdamage_rescaling": { + "type": "boolean", + "fa_icon": "fas fa-map", + "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" + }, + "rescale_length_5p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-right", + "description": "Length of read for mapDamage2 to rescale from 5p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2."
+ }, + "rescale_length_3p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-left", + "description": "Length of read for mapDamage2 to rescale from 3p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." + } + }, + "fa_icon": "fas fa-chart-line", + "help_text": "More documentation can be seen in the follow links for:\n\n- [DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler)\n- [PMDTools documentation](https://github.com/pontussk/PMDtools)\n\nIf using TSV input, DamageProfiler is performed per library, i.e. after lane\nmerging. PMDtools and BAM Trimming is run after library merging of same-named\nlibrary BAMs that have the same type of UDG treatment. BAM Trimming is only\nperformed on non-UDG and half-UDG treated data.\n" + }, + "feature_annotation_statistics": { + "title": "Feature Annotation Statistics", + "type": "object", + "description": "Options for getting reference annotation statistics (e.g. gene coverages)", + "default": "", + "properties": { + "run_bedtools_coverage": { + "type": "boolean", + "description": "Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.", + "fa_icon": "fas fa-chart-area", + "help_text": "Specifies to turn on the bedtools module, producing statistics for breadth (or percent coverage), and depth (or X fold) coverages.\n" + }, + "anno_file": { + "type": "string", + "description": "Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.", + "fa_icon": "fas fa-file-signature", + "help_text": "Specify the path to a GFF/BED containing the feature coordinates (or any acceptable input for [`bedtools coverage`](https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html)). 
Must be in quotes.\n" + } + }, + "fa_icon": "fas fa-scroll", + "help_text": "If you're interested in looking at coverage stats for certain features on your\nreference such as genes, SNPs etc., you can use the following bedtools module\nfor this purpose.\n\nMore documentation on bedtools can be seen in the [bedtools\ndocumentation](https://bedtools.readthedocs.io/en/latest/)\n\nIf using TSV input, bedtools is run after library merging of same-named library\nBAMs that have the same type of UDG treatment.\n" + }, + "bam_trimming": { + "title": "BAM Trimming", + "type": "object", + "description": "Options for trimming of aligned reads (e.g. to remove damage prior genotyping).", + "default": "", + "properties": { + "run_trim_bam": { + "type": "boolean", + "description": "Turn on BAM trimming. Will only run on non-UDG or half-UDG libraries", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file. Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming is typically performed to reduce errors during genotyping that can be caused by aDNA damage.\n\nBAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`).\n\n> Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping." 
+ }, + "bamutils_clip_double_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_double_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for double-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from double_stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_half_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded half-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `half`. 
Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_left": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler-combined", + "description": "Specify the number of bases to clip off reads from 'left' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_clip_single_stranded_none_udg_right": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-ruler", + "description": "Specify the number of bases to clip off reads from 'right' end of read for single-stranded non-UDG libraries.", + "help_text": "Default set to `0` and clips off no bases on the left or right side of reads from single-stranded libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read).\n\n> Modifies bamUtil's trimBam parameter: `-L -R`" + }, + "bamutils_softclip": { + "type": "boolean", + "description": "Turn on using softclip instead of hard masking.", + "fa_icon": "fas fa-paint-roller", + "help_text": "By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. 
Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.\n\n> Modifies bam trimBam parameter: `-c`" + } + }, + "fa_icon": "fas fa-eraser", + "help_text": "For some library preparation protocols, users might want to clip off damaged\nbases before applying genotyping methods. This can be done in nf-core/eager\nautomatically by turning on the `--run_trim_bam` parameter.\n\nMore documentation can be seen in the [bamUtil\ndocumentation](https://genome.sph.umich.edu/wiki/BamUtil:_trimBam)\n" + }, + "genotyping": { + "title": "Genotyping", + "type": "object", + "description": "Options for variant calling.", + "default": "", + "properties": { + "run_genotyping": { + "type": "boolean", + "description": "Turn on genotyping of BAM files.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on genotyping to run on all post-dedup and downstream BAMs. For example if `--run_pmdtools` and `--trim_bam` are both supplied, the genotyper will be run on all three BAM files i.e. post-deduplication, post-pmd and post-trimmed BAM files." + }, + "genotyping_tool": { + "type": "string", + "description": "Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.", + "fa_icon": "fas fa-tools", + "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller. 
Specify 'ug', 'hc', 'freebayes', 'pileupcaller' and 'angsd' respectively.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does _de novo_ assembly around each variant site), be aware GATK 3.5 is officially deprecated by the Broad Institute.", + "enum": [ + "ug", + "hc", + "freebayes", + "pileupcaller", + "angsd" + ] + }, + "genotyping_source": { + "type": "string", + "default": "raw", + "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", + "fa_icon": "fas fa-faucet", + "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", + "enum": [ + "raw", + "pmd", + "trimmed", + "rescaled" + ] + }, + "gatk_call_conf": { + "type": "integer", + "default": 30, + "description": "Specify GATK phred-scaled confidence threshold.", + "fa_icon": "fas fa-balance-scale-right", + "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call. Default: `30`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + }, + "gatk_ploidy": { + "type": "integer", + "default": 2, + "description": "Specify GATK organism ploidy.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms.
Default: `2`\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `--sample-ploidy`" + }, + "gatk_downsample": { + "type": "integer", + "default": 250, + "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", + "fa_icon": "fas fa-icicles", + "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. Any position with a coverage higher than this value will be randomly down-sampled to 250 reads. Default: `250`\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + }, + "gatk_dbsnp": { + "type": "string", + "description": "Specify VCF file for SNP annotation of output VCF files. Optional. Gzip not accepted.", + "fa_icon": "fas fa-marker", + "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.\n" + }, + "gatk_hc_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK genotyper HaplotypeCaller is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_ACTIVE_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK HaplotypeCaller parameter: `-output_mode`", + "enum": [ + "EMIT_ALL_ACTIVE_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_hc_emitrefconf": { + "type": "string", + "default": "GVCF", + "description": "Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK HaplotypeCaller is selected, mode for emitting reference confidence calls. 
Options: `'NONE'`, `'BP_RESOLUTION'`, `'GVCF'`. Default: `'GVCF'`\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`\n", + "enum": [ + "NONE", + "GVCF", + "BP_RESOLUTION" + ] + }, + "gatk_ug_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'.", + "fa_icon": "fas fa-bullhorn", + "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites. Options: `'EMIT_VARIANTS_ONLY'`, `'EMIT_ALL_CONFIDENT_SITES'`, `'EMIT_ALL_SITES'`. Default: `'EMIT_VARIANTS_ONLY'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "enum": [ + "EMIT_ALL_SITES", + "EMIT_ALL_CONFIDENT_SITES", + "EMIT_VARIANTS_ONLY" + ] + }, + "gatk_ug_genotype_model": { + "type": "string", + "default": "SNP", + "description": "Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc. Options: `'SNP'`, `'INDEL'`, `'BOTH'`, `'GENERALPLOIDYSNP'`, `'GENERALPLOIDYINDEL`'. 
Default: `'SNP'`\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`", + "enum": [ + "SNP", + "INDEL", + "BOTH", + "GENERALPLOIDYSNP", + "GENERALPLOIDYINDEL" + ] + }, + "gatk_ug_keep_realign_bam": { + "type": "boolean", + "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", + "fa_icon": "fas fa-align-left", + "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + }, + "gatk_ug_defaultbasequalities": { + "type": "string", + "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", + "fa_icon": "fas fa-undo-alt", + "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to -1 which is to not set any default quality (turned off). Default: `-1`\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`" + }, + "freebayes_C": { + "type": "integer", + "default": 1, + "description": "Specify minimum required supporting observations to consider a variant.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify minimum required supporting observations to consider a variant. 
Default: `1`\n\n> Modifies freebayes parameter: `-C`" + }, + "freebayes_g": { + "type": "integer", + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.", + "fa_icon": "fab fa-think-peaks", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`", + "default": 0 + }, + "freebayes_p": { + "type": "integer", + "default": 2, + "description": "Specify ploidy of sample in FreeBayes.", + "fa_icon": "fas fa-pastafarianism", + "help_text": "Specify ploidy of sample in FreeBayes. Default is diploid. Default: `2`\n\n> Modifies freebayes parameter: `-p`" + }, + "pileupcaller_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for pileupCaller.", + "fa_icon": "fas fa-bed", + "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate pileup for pileupCaller.\n" + }, + "pileupcaller_snpfile": { + "type": "string", + "description": "Specify path to SNP panel in EIGENSTRAT format for pileupCaller.", + "fa_icon": "fas fa-sliders-h", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/tree/master/CONVERTF) format, pileupCaller will call these sites.\n" + }, + "pileupcaller_method": { + "type": "string", + "default": "randomHaploid", + "description": "Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'.", + "fa_icon": "fas fa-toolbox", + "help_text": "Specify calling method to use. Options: randomHaploid, randomDiploid, majorityCall. 
Default: `'randomHaploid'`\n\n> Modifies pileupCaller parameter: `--randomHaploid --randomDiploid --majorityCall`", + "enum": [ + "randomHaploid", + "randomDiploid", + "majorityCall" + ] + }, + "pileupcaller_transitions_mode": { + "type": "string", + "default": "AllSites", + "description": "Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'.", + "fa_icon": "fas fa-toggle-on", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. Options: `'AllSites'`, `'TransitionsMissing'`, `'SkipTransitions'`. Default: `'AllSites'`\n\n> Modifies pileupCaller parameter: `--skipTransitions --transitionsMissing`", + "enum": [ + "AllSites", + "TransitionsMissing", + "SkipTransitions" + ] + }, + "pileupcaller_min_map_quality": { + "type": "integer", + "default": 30, + "description": "The minimum mapping quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum mapping quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-q` parameter of samtools mpileup." + }, + "pileupcaller_min_base_quality": { + "type": "integer", + "default": 30, + "description": "The minimum base quality to be used for genotyping.", + "fa_icon": "fas fa-filter", + "help_text": "The minimum base quality to be used for genotyping. Affects the `samtools pileup` output that is used by pileupcaller. Affects `-Q` parameter of samtools mpileup." + }, + "angsd_glmodel": { + "type": "string", + "default": "samtools", + "description": "Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'.", + "fa_icon": "fas fa-project-diagram", + "help_text": "Specify which genotype likelihood model to use. Options: `'samtools`, `'gatk'`, `'soapsnp'`, `'syk'`. 
Default: `'samtools'`\n\n> Modifies ANGSD parameter: `-GL`", + "enum": [ + "samtools", + "gatk", + "soapsnp", + "syk" + ] + }, + "angsd_glformat": { + "type": "string", + "default": "binary", + "description": "Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'.", + "fa_icon": "fas fa-text-height", + "help_text": "Specifies what type of genotyping likelihood file format will be output. Options: `'text'`, `'binary'`, `'binary_three'`, `'beagle'`. Default: `'binary'`.\n\nThe options refer to the following descriptions respectively:\n\n- `text`: text output of all 10 log genotype likelihoods.\n- `binary`: binary all 10 log genotype likelihood\n- `binary_three`: binary 3 times likelihood\n- `beagle`: beagle likelihood file\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies ANGSD parameter: `-doGlF`", + "enum": [ + "text", + "binary", + "binary_three", + "beagle" + ] + }, + "angsd_createfasta": { + "type": "boolean", + "description": "Turn on creation of FASTA from ANGSD genotyping likelihood.", + "fa_icon": "fas fa-align-justify", + "help_text": "Turns on the ANGSD creation of a FASTA file from the BAM file.\n" + }, + "angsd_fastamethod": { + "type": "string", + "default": "random", + "description": "Specify which type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'.", + "fa_icon": "fas fa-toolbox", + "help_text": "The type of base calling to be performed when creating the ANGSD FASTA file. Options: `'random'` or `'common'`. `'common'` will output the most common non-N base at each given position, whereas `'random'` will pick one at random.
Default: `'random'`.\n\n> Modifies ANGSD parameter: `-doFasta -doCounts`", + "enum": [ + "random", + "common" + ] + }, + "run_bcftools_stats": { + "type": "boolean", + "default": true, + "description": "Turn on bcftools stats generation for VCF based variant calling statistics", + "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics.", + "fa_icon": "far fa-chart-bar" + } + }, + "fa_icon": "fas fa-sliders-h", + "help_text": "There are options for different genotypers (or genotype likelihood calculators)\nto be used. We suggest you read the documentation of each tool to find the ones that\nsuit your needs.\n\nDocumentation for each tool:\n\n- [GATK\n UnifiedGenotyper](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.php)\n- [GATK\n HaplotypeCaller](https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php)\n- [FreeBayes](https://github.com/ekg/freebayes)\n- [ANGSD](http://www.popgen.dk/angsd/index.php/Genotype_Likelihoods)\n- [sequenceTools pileupCaller](https://github.com/stschiff/sequenceTools)\n\nIf using TSV input, genotyping is performed per sample (i.e. after all types of\nlibraries are merged), except for pileupCaller which gathers all double-stranded and\nsingle-stranded (same-type merged) libraries respectively." + }, + "consensus_sequence_generation": { + "title": "Consensus Sequence Generation", + "type": "object", + "description": "Options for creation of a per-sample FASTA sequence useful for downstream analysis (e.g. 
multi sequence alignment)", + "default": "", + "properties": { + "run_vcf2genome": { + "type": "boolean", + "description": "Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on consensus sequence genome creation via VCF2Genome. Only accepts GATK UnifiedGenotyper VCF files with the `--gatk_ug_out_mode 'EMIT_ALL_SITES'` and `--gatk_ug_genotype_model 'SNP'` flags. Typically useful for small genomes such as mitochondria.\n" + }, + "vcf2genome_outfile": { + "type": "string", + "description": "Specify name of the output FASTA file containing the consensus sequence. Do not include `.fasta` in the file name.", + "fa_icon": "fas fa-file-alt", + "help_text": "The name of your requested output FASTA file. Do not include `.fasta` suffix.\n" + }, + "vcf2genome_header": { + "type": "string", + "description": "Specify the header name of the consensus sequence entry within the FASTA file.", + "fa_icon": "fas fa-heading", + "help_text": "The name of the FASTA entry you would like in your FASTA file.\n" + }, + "vcf2genome_minc": { + "type": "integer", + "default": 5, + "description": "Minimum depth coverage required for a call to be included (else N will be called).", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "Minimum depth coverage for a SNP to be made. Else, a SNP will be called as N. Default: `5`\n\n> Modifies VCF2Genome parameter: `-minc`" + }, + "vcf2genome_minq": { + "type": "integer", + "default": 30, + "description": "Minimum genotyping quality of a call to be called. Else N will be called.", + "fa_icon": "fas fa-medal", + "help_text": "Minimum genotyping quality of a call to be made. Else N will be called. Default: `30`\n\n> Modifies VCF2Genome parameter: `-minq`" + }, + "vcf2genome_minfreq": { + "type": "number", + "default": 0.8, + "description": "Minimum fraction of reads supporting a call to be included.
Else N will be called.", + "fa_icon": "fas fa-percent", + "help_text": "In the case of two possible alleles, the frequency of the majority allele required for a call to be made. Else, a SNP will be called as N. Default: `0.8`\n\n> Modifies VCF2Genome parameter: `-minfreq`" + } + }, + "fa_icon": "fas fa-handshake", + "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." + }, + "snp_table_generation": { + "title": "SNP Table Generation", + "type": "object", + "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", + "default": "", + "properties": { + "run_multivcfanalyzer": { + "type": "boolean", + "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", + "fa_icon": "fas fa-power-off", + "help_text": "Turns on MultiVCFAnalyzer. Will only work when in combination with UnifiedGenotyper genotyping module.\n" + }, + "write_allele_frequencies": { + "type": "boolean", + "description": "Turn on writing write allele frequencies in the SNP table.", + "fa_icon": "fas fa-pen", + "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" + }, + "min_genotype_quality": { + "type": "integer", + "default": 30, + "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", + "fa_icon": "fas fa-medal", + "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" + }, + "min_base_coverage": { + "type": "integer", + "default": 5, + "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", + "fa_icon": "fas fa-sort-amount-up", + "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" + }, + "min_allele_freq_hom": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" + }, + "min_allele_freq_het": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", + "fa_icon": "fas fa-percent", + "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." + }, + "additional_vcf_files": { + "type": "string", + "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. 
Use wildcard(s) for multiple files.", + "fa_icon": "fas fa-copy", + "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for [GATK UnifiedGenotyping](#genotyping-parameters) module above." + }, + "reference_gff_annotations": { + "type": "string", + "default": "NA", + "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", + "fa_icon": "fas fa-file-signature", + "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" + }, + "reference_gff_exclude": { + "type": "string", + "default": "NA", + "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", + "fa_icon": "fas fa-times", + "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" + }, + "snp_eff_results": { + "type": "string", + "default": "NA", + "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", + "fa_icon": "fas fa-magic", + "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." + } + }, + "fa_icon": "fas fa-table", + "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. The current version\nof MultiVCFAnalyzer version only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). 
A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." + }, + "mitochondrial_to_nuclear_ratio": { + "title": "Mitochondrial to Nuclear Ratio", + "type": "object", + "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", + "default": "", + "properties": { + "run_mtnucratio": { + "type": "boolean", + "description": "Turn on mitochondrial to nuclear ratio calculation.", + "fa_icon": "fas fa-balance-scale-left", + "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" + }, + "mtnucratio_header": { + "type": "string", + "default": "MT", + "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", + "fa_icon": "fas fa-heading", + "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on hs37d5/GrCH37 human reference genome. 
Default: 'MT'" + } + }, + "fa_icon": "fas fa-balance-scale-left", + "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" + }, + "human_sex_determination": { + "title": "Human Sex Determination", + "type": "object", + "description": "Options for the calculation of biological sex of human individuals.", + "default": "", + "properties": { + "run_sexdeterrmine": { + "type": "boolean", + "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", + "fa_icon": "fas fa-transgender-alt", + "help_text": "Specify to run the optional process of sex determination.\n" + }, + "sexdeterrmine_bedfile": { + "type": "string", + "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", + "fa_icon": "fas fa-bed", + "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." + } + }, + "fa_icon": "fas fa-transgender", + "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
+ }, + "nuclear_contamination_for_human_dna": { + "title": "Nuclear Contamination for Human DNA", + "type": "object", + "description": "Options for the estimation of contamination of human DNA.", + "default": "", + "properties": { + "run_nuclear_contamination": { + "type": "boolean", + "description": "Turn on nuclear contamination estimation for human reference genomes.", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" + }, + "contamination_chrom_name": { + "type": "string", + "default": "X", + "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", + "fa_icon": "fas fa-address-card", + "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." + } + }, + "fa_icon": "fas fa-radiation-alt" + }, + "metagenomic_screening": { + "title": "Metagenomic Screening", + "type": "object", + "description": "Options for metagenomic screening of off-target reads.", + "default": "", + "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" + }, + "metagenomic_complexity_entropy": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify a minimum entropy threshold that under which it will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies`bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" + }, + "run_metagenomic_screening": { + "type": "boolean", + "description": "Turn on metagenomic screening module for reference-unmapped reads.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on the metagenomic screening module.\n" + }, + "metagenomic_tool": { + "type": "string", + "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", + "fa_icon": "fas fa-tools", + "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. You should clean the\nwork directory with the command to ensure non-redundancy and large HDD\nfootprints!" 
+ }, + "database": { + "type": "string", + "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", + "fa_icon": "fas fa-database", + "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." + }, + "metagenomic_min_support_reads": { + "type": "integer", + "default": 1, + "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + }, + "percent_identity": { + "type": "integer", + "default": 85, + "description": "Percent identity value threshold for MALT.", + "fa_icon": "fas fa-id-card", + "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + }, + "malt_mode": { + "type": "string", + "default": "BlastN", + "description": "Specify which alignment mode to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", + "fa_icon": "fas fa-align-left", + "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. 
Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", + "enum": [ + "BlastN", + "BlastP", + "BlastX" + ] + }, + "malt_alignment_mode": { + "type": "string", + "default": "SemiGlobal", + "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", + "fa_icon": "fas fa-align-center", + "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", + "enum": [ + "Local", + "SemiGlobal" + ] + }, + "malt_top_percent": { + "type": "integer", + "default": 1, + "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", + "fa_icon": "fas fa-percent", + "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + }, + "malt_min_support_mode": { + "type": "string", + "default": "percent", + "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. 
Options: 'percent', 'reads'.", + "fa_icon": "fas fa-drumstick-bite", + "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", + "enum": [ + "percent", + "reads" + ] + }, + "malt_min_support_percent": { + "type": "number", + "default": 0.01, + "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", + "fa_icon": "fas fa-percentage", + "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + }, + "malt_max_queries": { + "type": "integer", + "default": 100, + "description": "Specify the maximum number of queries a read can have for MALT.", + "fa_icon": "fas fa-phone", + "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + }, + "malt_memory_mode": { + "type": "string", + "default": "load", + "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", + "fa_icon": "fas fa-memory", + "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. 
Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", + "enum": [ + "load", + "page", + "map" + ] + }, + "malt_sam_output": { + "type": "boolean", + "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", + "fa_icon": "fas fa-file-alt", + "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. \n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" + } + }, + "fa_icon": "fas fa-search", + "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. 
the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set a absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." + }, + "metagenomic_authentication": { + "title": "Metagenomic Authentication", + "type": "object", + "description": "Options for authentication of metagenomic screening performed by MALT.", + "default": "", + "properties": { + "run_maltextract": { + "type": "boolean", + "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", + "fa_icon": "fas fa-power-off", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" + }, + "maltextract_taxon_list": { + "type": "string", + "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", + "fa_icon": "fas fa-list-ul", + "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. 
In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_ncbifiles": { + "type": "string", + "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", + "fa_icon": "fas fa-database", + "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." + }, + "maltextract_filter": { + "type": "string", + "default": "def_anc", + "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", + "fa_icon": "fas fa-filter", + "help_text": "Specify which MaltExtract filter to use. This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", + "enum": [ + "def_anc", + "default", + "ancient", + "scan", + "crawl", + "srna" + ] + }, + "maltextract_toppercent": { + "type": "number", + "default": 0.01, + "description": "Specify percent of top alignments to use.", + "fa_icon": "fas fa-percent", + "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 
1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" + }, + "maltextract_destackingoff": { + "type": "boolean", + "description": "Turn off destacking.", + "fa_icon": "fas fa-align-center", + "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" + }, + "maltextract_downsamplingoff": { + "type": "boolean", + "description": "Turn off downsampling.", + "fa_icon": "fab fa-creative-commons-sampling", + "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" + }, + "maltextract_duplicateremovaloff": { + "type": "boolean", + "description": "Turn off duplicate removal.", + "fa_icon": "fas fa-align-left", + "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" + }, + "maltextract_matches": { + "type": "boolean", + "description": "Turn on exporting alignments of hits in BLAST format.", + "fa_icon": "fas fa-equals", + "help_text": "\nExport alignments of hits for each node in BLAST format. 
By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" + }, + "maltextract_megansummary": { + "type": "boolean", + "description": "Turn on export of MEGAN summary files.", + "fa_icon": "fas fa-download", + "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" + }, + "maltextract_percentidentity": { + "type": "number", + "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", + "default": 85, + "fa_icon": "fas fa-id-card", + "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" + }, + "maltextract_topalignment": { + "type": "boolean", + "description": "Turn on using top alignments per read after filtering.", + "fa_icon": "fas fa-star-half-alt", + "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. 
Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" + } + }, + "fa_icon": "fas fa-tasks", + "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" } - }, - "fa_icon": "fas fa-handshake", - "help_text": "If using TSV input, consensus generation is performed per sample (i.e. after all\ntypes of libraries are merged)." }, - "snp_table_generation": { - "title": "SNP Table Generation", - "type": "object", - "description": "Options for creation of a SNP table useful for downstream analysis (e.g. estimation of cross-mapping of different species and multi-sequence alignment)", - "default": "", - "properties": { - "run_multivcfanalyzer": { - "type": "boolean", - "description": "Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input.", - "fa_icon": "fas fa-power-off", - "help_text": "Turns on MultiVCFAnalyzer. Will only work when in combination with UnifiedGenotyper genotyping module.\n" + "allOf": [ + { + "$ref": "#/definitions/input_output_options" }, - "write_allele_frequencies": { - "type": "boolean", - "description": "Turn on writing write allele frequencies in the SNP table.", - "fa_icon": "fas fa-pen", - "help_text": "Specify whether to tell MultiVCFAnalyzer to write within the SNP table the frequencies of the allele at that position e.g. A (70%).\n" + { + "$ref": "#/definitions/input_data_additional_options" }, - "min_genotype_quality": { - "type": "integer", - "default": 30, - "description": "Specify the minimum genotyping quality threshold for a SNP to be called.", - "fa_icon": "fas fa-medal", - "help_text": "The minimal genotyping quality for a SNP to be considered for processing by MultiVCFAnalyzer. 
The default threshold is `30`.\n" + { + "$ref": "#/definitions/reference_genome_options" }, - "min_base_coverage": { - "type": "integer", - "default": 5, - "description": "Specify the minimum number of reads a position needs to be covered to be considered for base calling.", - "fa_icon": "fas fa-sort-amount-up", - "help_text": "The minimal number of reads covering a base for a SNP at that position to be considered for processing by MultiVCFAnalyzer. The default depth is `5`.\n" + { + "$ref": "#/definitions/output_options" }, - "min_allele_freq_hom": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimal frequency of a nucleotide for a 'homozygous' SNP to be called. In other words, e.g. 90% of the reads covering that position must have that SNP to be called. If the threshold is not reached, and the previous two parameters are matched, a reference call is made (displayed as . in the SNP table). If the above two parameters are not met, an 'N' is called. The default allele frequency is `0.9`.\n" + { + "$ref": "#/definitions/generic_options" }, - "min_allele_freq_het": { - "type": "number", - "default": 0.9, - "description": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", - "fa_icon": "fas fa-percent", - "help_text": "The minimum frequency of a nucleotide for a 'heterozygous' SNP to be called. If\nthis parameter is set to the same as `--min_allele_freq_hom`, then only\nhomozygous calls are made. If this value is less than the previous parameter,\nthen a SNP call will be made. If it is between this and the previous parameter,\nit will be displayed as a IUPAC uncertainty call. Default is `0.9`." 
+ { + "$ref": "#/definitions/max_job_request_options" }, - "additional_vcf_files": { - "type": "string", - "description": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. Use wildcard(s) for multiple files.", - "fa_icon": "fas fa-copy", - "help_text": "If you wish to add to the table previously created VCF files, specify here a path with wildcards (in quotes). These VCF files must be created the same way as your settings for [GATK UnifiedGenotyping](#genotyping-parameters) module above." + { + "$ref": "#/definitions/institutional_config_options" }, - "reference_gff_annotations": { - "type": "string", - "default": "NA", - "description": "Specify path to the reference genome annotations in '.gff' format. Optional.", - "fa_icon": "fas fa-file-signature", - "help_text": "If you wish to report in the SNP table annotation information for the regions\nSNPs fall in, provide a file in GFF format (the path must be in quotes).\n" + { + "$ref": "#/definitions/skip_steps" }, - "reference_gff_exclude": { - "type": "string", - "default": "NA", - "description": "Specify path to the positions to be excluded in '.gff' format. Optional.", - "fa_icon": "fas fa-times", - "help_text": "If you wish to exclude SNP regions from consideration by MultiVCFAnalyzer (such as for problematic regions), provide a file in GFF format (the path must be in quotes).\n" + { + "$ref": "#/definitions/complexity_filtering" }, - "snp_eff_results": { - "type": "string", - "default": "NA", - "description": "Specify path to the output file from SNP effect analysis in '.txt' format. Optional.", - "fa_icon": "fas fa-magic", - "help_text": "If you wish to include results from SNPEff effect analysis, supply the output\nfrom SNPEff in txt format (the path must be in quotes)." - } - }, - "fa_icon": "fas fa-table", - "help_text": "SNP Table Generation here is performed by MultiVCFAnalyzer. 
The current version\nof MultiVCFAnalyzer version only accepts GATK UnifiedGenotyper 3.5 VCF files,\nand when the ploidy was set to 2 (this allows MultiVCFAnalyzer to report\nfrequencies of polymorphic positions). A description of how the tool works can\nbe seen in the Supplementary Information of [Bos et al.\n(2014)](https://doi.org/10.1038/nature13591) under \"SNP Calling and Phylogenetic\nAnalysis\".\n\nMore can be seen in the [MultiVCFAnalyzer\ndocumentation](https://github.com/alexherbig/MultiVCFAnalyzer).\n\nIf using TSV input, MultiVCFAnalyzer is performed on all samples gathered\ntogether." - }, - "mitochondrial_to_nuclear_ratio": { - "title": "Mitochondrial to Nuclear Ratio", - "type": "object", - "description": "Options for the calculation of ratio of reads to one chromosome/FASTA entry against all others.", - "default": "", - "properties": { - "run_mtnucratio": { - "type": "boolean", - "description": "Turn on mitochondrial to nuclear ratio calculation.", - "fa_icon": "fas fa-balance-scale-left", - "help_text": "Turn on the module to estimate the ratio of mitochondrial to nuclear reads.\n" + { + "$ref": "#/definitions/read_merging_and_adapter_removal" }, - "mtnucratio_header": { - "type": "string", - "default": "MT", - "description": "Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space).", - "fa_icon": "fas fa-heading", - "help_text": "Specify the FASTA entry in the reference file specified as `--fasta`, which acts\nas the mitochondrial 'chromosome' to base the ratio calculation on. The tool\nonly accepts the first section of the header before the first space. The default\nchromosome name is based on hs37d5/GrCH37 human reference genome. 
Default: 'MT'" - } - }, - "fa_icon": "fas fa-balance-scale-left", - "help_text": "If using TSV input, Mitochondrial to Nuclear Ratio calculation is calculated per\ndeduplicated library (after lane merging)" - }, - "human_sex_determination": { - "title": "Human Sex Determination", - "type": "object", - "description": "Options for the calculation of biological sex of human individuals.", - "default": "", - "properties": { - "run_sexdeterrmine": { - "type": "boolean", - "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", - "fa_icon": "fas fa-transgender-alt", - "help_text": "Specify to run the optional process of sex determination.\n" - }, - "sexdeterrmine_bedfile": { - "type": "string", - "description": "Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation).", - "fa_icon": "fas fa-bed", - "help_text": "Specify an optional bedfile of the list of SNPs to be used for X-/Y-rate calculation. Running without this parameter will considerably increase runtime, and render the resulting error bars untrustworthy. Theoretically, any set of SNPs that are distant enough that two SNPs are unlikely to be covered by the same read can be used here. The programme was coded with the 1240K panel in mind. The path must be in quotes." - } - }, - "fa_icon": "fas fa-transgender", - "help_text": "An optional process for human DNA. It can be used to calculate the relative\ncoverage of X and Y chromosomes compared to the autosomes (X-/Y-rate). Standard\nerrors for these measurements are also calculated, assuming a binomial\ndistribution of reads across the SNPs.\n\nIf using TSV input, SexDetERRmine is performed on all samples gathered together." 
- }, - "nuclear_contamination_for_human_dna": { - "title": "Nuclear Contamination for Human DNA", - "type": "object", - "description": "Options for the estimation of contamination of human DNA.", - "default": "", - "properties": { - "run_nuclear_contamination": { - "type": "boolean", - "description": "Turn on nuclear contamination estimation for human reference genomes.", - "fa_icon": "fas fa-power-off", - "help_text": "Specify to run the optional processes for (human) nuclear DNA contamination estimation.\n" + { + "$ref": "#/definitions/mapping" }, - "contamination_chrom_name": { - "type": "string", - "default": "X", - "description": "The name of the X chromosome in your bam/FASTA header. 'X' for hs37d5, 'chrX' for HG19.", - "fa_icon": "fas fa-address-card", - "help_text": "The name of the human chromosome X in your bam. `'X'` for hs37d5, `'chrX'` for HG19. Defaults to `'X'`." - } - }, - "fa_icon": "fas fa-radiation-alt" - }, - "metagenomic_screening": { - "title": "Metagenomic Screening", - "type": "object", - "description": "Options for metagenomic screening of off-target reads.", - "default": "", - "properties": { - "metagenomic_complexity_filter": { - "type": "boolean", - "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", - "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. 
**Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", - "fa_icon": "fas fa-filter" + { + "$ref": "#/definitions/host_removal" }, - "metagenomic_complexity_entropy": { - "type": "number", - "default": 0.3, - "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1.", - "minimum": 0, - "maximum": 1, - "help_text": "Specify a minimum entropy threshold that under which it will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies`bbduk` parameter `entropy=`", - "fa_icon": "fas fa-percent" + { + "$ref": "#/definitions/bam_filtering" }, - "run_metagenomic_screening": { - "type": "boolean", - "description": "Turn on metagenomic screening module for reference-unmapped reads.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on the metagenomic screening module.\n" + { + "$ref": "#/definitions/deduplication" }, - "metagenomic_tool": { - "type": "string", - "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", - "fa_icon": "fas fa-tools", - "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. 
You should clean the\nwork directory with the command to ensure non-redundancy and large HDD\nfootprints!" + { + "$ref": "#/definitions/library_complexity_analysis" }, - "database": { - "type": "string", - "description": "Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.", - "fa_icon": "fas fa-database", - "help_text": "Specify the path to the _directory_ containing your taxonomic classifier's database (malt or kraken).\n\nFor Kraken2, it can be either the path to the _directory_ or the path to the `.tar.gz` compressed directory of the Kraken2 database." + { + "$ref": "#/definitions/adna_damage_analysis" }, - "metagenomic_min_support_reads": { - "type": "integer", - "default": 1, - "description": "Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with --malt_min_support_mode 'percent'.", - "fa_icon": "fas fa-sort-numeric-up-alt", - "help_text": "Specify the minimum number of reads a given taxon is required to have to be retained as a positive 'hit'. \nFor malt, this only applies when `--malt_min_support_mode` is set to 'reads'. Default: 1.\n\n> Modifies MALT or kraken_parse.py parameter: `-sup` and `-c` respectively\n" + { + "$ref": "#/definitions/feature_annotation_statistics" }, - "percent_identity": { - "type": "integer", - "default": 85, - "description": "Percent identity value threshold for MALT.", - "fa_icon": "fas fa-id-card", - "help_text": "Specify the minimum percent identity (or similarity) a sequence must have to the reference for it to be retained. Default is `85`\n\nOnly used when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-id`" + { + "$ref": "#/definitions/bam_trimming" }, - "malt_mode": { - "type": "string", - "default": "BlastN", - "description": "Specify which alignment mode to use for MALT. 
Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'.", - "fa_icon": "fas fa-align-left", - "help_text": "Use this to run the program in 'BlastN', 'BlastP', 'BlastX' modes to align DNA\nand DNA, protein and protein, or DNA reads against protein references\nrespectively. Ensure your database matches the mode. Check the\n[MALT\nmanual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf)\nfor more details. Default: `'BlastN'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-m`\n", - "enum": ["BlastN", "BlastP", "BlastX"] + { + "$ref": "#/definitions/genotyping" }, - "malt_alignment_mode": { - "type": "string", - "default": "SemiGlobal", - "description": "Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'.", - "fa_icon": "fas fa-align-center", - "help_text": "Specify what alignment algorithm to use. Options are 'Local' or 'SemiGlobal'. Local is a BLAST like alignment, but is much slower. Semi-global alignment aligns reads end-to-end. Default: `'SemiGlobal'`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-at`", - "enum": ["Local", "SemiGlobal"] + { + "$ref": "#/definitions/consensus_sequence_generation" }, - "malt_top_percent": { - "type": "integer", - "default": 1, - "description": "Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual).", - "fa_icon": "fas fa-percent", - "help_text": "Specify the top percent value of the LCA algorithm. From the [MALT manual](http://ab.inf.uni-tuebingen.de/data/software/malt/download/manual.pdf): \"For each\nread, only those matches are used for taxonomic placement whose bit disjointScore is within\n10% of the best disjointScore for that read.\". 
Default: `1`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-top`" + { + "$ref": "#/definitions/snp_table_generation" }, - "malt_min_support_mode": { - "type": "string", - "default": "percent", - "description": "Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'.", - "fa_icon": "fas fa-drumstick-bite", - "help_text": "Specify whether to use a percentage, or raw number of reads as the value used to decide the minimum support a taxon requires to be retained.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-sup -supp`", - "enum": ["percent", "reads"] + { + "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" }, - "malt_min_support_percent": { - "type": "number", - "default": 0.01, - "description": "Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT.", - "fa_icon": "fas fa-percentage", - "help_text": "Specify the minimum number of reads (as a percentage of all assigned reads) a given taxon is required to have to be retained as a positive 'hit' in the RMA6 file. This only applies when `--malt_min_support_mode` is set to 'percent'. Default 0.01.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-supp`" + { + "$ref": "#/definitions/human_sex_determination" }, - "malt_max_queries": { - "type": "integer", - "default": 100, - "description": "Specify the maximum number of queries a read can have for MALT.", - "fa_icon": "fas fa-phone", - "help_text": "Specify the maximum number of alignments a read can have. All further alignments are discarded. 
Default: `100`\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `-mq`" + { + "$ref": "#/definitions/nuclear_contamination_for_human_dna" }, - "malt_memory_mode": { - "type": "string", - "default": "load", - "description": "Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'.", - "fa_icon": "fas fa-memory", - "help_text": "\nHow to load the database into memory. Options are `'load'`, `'page'` or `'map'`.\n'load' directly loads the entire database into memory prior seed look up, this\nis slow but compatible with all servers/file systems. `'page'` and `'map'`\nperform a sort of 'chunked' database loading, allowing seed look up prior entire\ndatabase loading. Note that Page and Map modes do not work properly not with\nmany remote file-systems such as GPFS. Default is `'load'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MALT parameter: `--memoryMode`", - "enum": ["load", "page", "map"] + { + "$ref": "#/definitions/metagenomic_screening" }, - "malt_sam_output": { - "type": "boolean", - "description": "Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.", - "fa_icon": "fas fa-file-alt", - "help_text": "Specify to _also_ produce gzipped SAM files of all alignments and un-aligned reads in addition to RMA6 files. These are **not** soft-clipped or in 'sparse' format. Can be useful for downstream analyses due to more common file format. 
\n\n:warning: can result in very large run output directories as this is essentially duplication of the RMA6 files.\n\n> Modifies MALT parameter `-a -f`" + { + "$ref": "#/definitions/metagenomic_authentication" } - }, - "fa_icon": "fas fa-search", - "help_text": "\nAn increasingly common line of analysis in high-throughput aDNA analysis today\nis simultaneously screening off target reads of the host for endogenous\nmicrobial signals - particularly of pathogens. Metagenomic screening is\ncurrently offered via MALT with aDNA specific verification via MaltExtract, or\nKraken2.\n\nPlease note the following:\n\n- :warning: Metagenomic screening is only performed on _unmapped_ reads from a\n mapping step.\n - You _must_ supply the `--run_bam_filtering` flag with unmapped reads in\n FASTQ format.\n - If you wish to run solely MALT (i.e. the HOPS pipeline), you must still\n supply a small decoy genome like phiX or human mtDNA `--fasta`.\n- MALT database construction functionality is _not_ included within the pipeline\n - this should be done independently, **prior** the nf-core/eager run.\n - To use `malt-build` from the same version as `malt-run`, load either the\n Docker, Singularity or Conda environment.\n- MALT can often require very large computing resources depending on your\n database. We set a absolute minimum of 16 cores and 128GB of memory (which is\n 1/4 of the recommendation from the developer). Please leave an issue on the\n [nf-core github](https://github.com/nf-core/eager/issues) if you would like to\n see this changed.\n\n> :warning: Running MALT on a server with less than 128GB of memory should be\n> performed at your own risk.\n\nIf using TSV input, metagenomic screening is performed on all samples gathered\ntogether." 
- }, - "metagenomic_authentication": { - "title": "Metagenomic Authentication", - "type": "object", - "description": "Options for authentication of metagenomic screening performed by MALT.", - "default": "", - "properties": { - "run_maltextract": { - "type": "boolean", - "description": "Turn on MaltExtract for MALT aDNA characteristics authentication.", - "fa_icon": "fas fa-power-off", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic output from MALT.\n\nMore can be seen in the [MaltExtract documentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" - }, - "maltextract_taxon_list": { - "type": "string", - "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", - "fa_icon": "fas fa-list-ul", - "help_text": "\nPath to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format.\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_ncbifiles": { - "type": "string", - "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", - "fa_icon": "fas fa-database", - "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\n\nOnly when `--metagenomic_tool malt` is also supplied." - }, - "maltextract_filter": { - "type": "string", - "default": "def_anc", - "description": "Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'.", - "fa_icon": "fas fa-filter", - "help_text": "Specify which MaltExtract filter to use. 
This is used to specify what types of characteristics to scan for. The default will output statistics on all alignments, and then a second set with just reads with one C to T mismatch in the first 5 bases. Further details on other parameters can be seen in the [HOPS documentation](https://github.com/rhuebler/HOPS/#maltextract-parameters). Options: `'def_anc'`, `'ancient'`, `'default'`, `'crawl'`, `'scan'`, `'srna'`, 'assignment'. Default: `'def_anc'`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-f`", - "enum": ["def_anc", "default", "ancient", "scan", "crawl", "srna"] - }, - "maltextract_toppercent": { - "type": "number", - "default": 0.01, - "description": "Specify percent of top alignments to use.", - "fa_icon": "fas fa-percent", - "help_text": "Specify frequency of top alignments for each read to be considered for each node.\nDefault is 0.01, i.e. 1% of all reads (where 1 would correspond to 100%).\n\n> :warning: this parameter follows the same concept as `--malt_top_percent` but\n> uses a different notation i.e. integer (MALT) versus float (MALTExtract)\n\nDefault: `0.01`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `-a`" - }, - "maltextract_destackingoff": { - "type": "boolean", - "description": "Turn off destacking.", - "fa_icon": "fas fa-align-center", - "help_text": "Turn off destacking. If left on, a read that overlaps with another read will be\nremoved (leaving a depth coverage of 1).\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--destackingOff`" - }, - "maltextract_downsamplingoff": { - "type": "boolean", - "description": "Turn off downsampling.", - "fa_icon": "fab fa-creative-commons-sampling", - "help_text": "Turn off downsampling. By default, downsampling is on and will randomly select 10,000 reads if the number of reads on a node exceeds this number. 
This is to speed up processing, under the assumption at 10,000 reads the species is a 'true positive'.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--downSampOff`" - }, - "maltextract_duplicateremovaloff": { - "type": "boolean", - "description": "Turn off duplicate removal.", - "fa_icon": "fas fa-align-left", - "help_text": "\nTurn off duplicate removal. By default, reads that are an exact copy (i.e. same start, stop coordinate and exact sequence match) will be removed as it is considered a PCR duplicate.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--dupRemOff`" - }, - "maltextract_matches": { - "type": "boolean", - "description": "Turn on exporting alignments of hits in BLAST format.", - "fa_icon": "fas fa-equals", - "help_text": "\nExport alignments of hits for each node in BLAST format. By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--matches`" - }, - "maltextract_megansummary": { - "type": "boolean", - "description": "Turn on export of MEGAN summary files.", - "fa_icon": "fas fa-download", - "help_text": "Export 'minimal' summary files (i.e. without alignments) that can be loaded into [MEGAN6](https://doi.org/10.1371/journal.pcbi.1004957). By default turned off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--meganSummary`" - }, - "maltextract_percentidentity": { - "type": "number", - "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.", - "default": 85, - "fa_icon": "fas fa-id-card", - "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. 
Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`" - }, - "maltextract_topalignment": { - "type": "boolean", - "description": "Turn on using top alignments per read after filtering.", - "fa_icon": "fas fa-star-half-alt", - "help_text": "Use the best alignment of each read for every statistic, except for those concerning read distribution and coverage. Default: off.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--useTopAlignment`" - } - }, - "fa_icon": "fas fa-tasks", - "help_text": "Turn on MaltExtract for MALT aDNA characteristics authentication of metagenomic\noutput from MALT.\n\nMore can be seen in the [MaltExtract\ndocumentation](https://github.com/rhuebler/)\n\nOnly when `--metagenomic_tool malt` is also supplied" - } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/input_data_additional_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/output_options" - }, - { - "$ref": "#/definitions/generic_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/skip_steps" - }, - { - "$ref": "#/definitions/complexity_filtering" - }, - { - "$ref": "#/definitions/read_merging_and_adapter_removal" - }, - { - "$ref": "#/definitions/mapping" - }, - { - "$ref": "#/definitions/host_removal" - }, - { - "$ref": "#/definitions/bam_filtering" - }, - { - "$ref": "#/definitions/deduplication" - }, - { - "$ref": "#/definitions/library_complexity_analysis" - }, - { - "$ref": "#/definitions/adna_damage_analysis" - }, - { - "$ref": "#/definitions/feature_annotation_statistics" - }, 
- { - "$ref": "#/definitions/bam_trimming" - }, - { - "$ref": "#/definitions/genotyping" - }, - { - "$ref": "#/definitions/consensus_sequence_generation" - }, - { - "$ref": "#/definitions/snp_table_generation" - }, - { - "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" - }, - { - "$ref": "#/definitions/human_sex_determination" - }, - { - "$ref": "#/definitions/nuclear_contamination_for_human_dna" - }, - { - "$ref": "#/definitions/metagenomic_screening" - }, - { - "$ref": "#/definitions/metagenomic_authentication" - } - ] -} + ] +} \ No newline at end of file From 391489a9b49169f6983085f2f1bae1148b7f7d81 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:32:46 +0200 Subject: [PATCH 10/15] Fix markdown linting --- CODE_OF_CONDUCT.md | 2 +- docs/output.md | 22 +++++++++++----------- docs/usage.md | 14 +++++++------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f1..f4a82a677 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -44,7 +44,7 @@ The safety officer in consultation with the nf-core core team have the right and Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When are where does this Code of Conduct apply Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. 
This includes but is not limited to the following listed alphabetically and therefore in no order of preference: diff --git a/docs/output.md b/docs/output.md index 8acdbe1d4..fcd345f16 100644 --- a/docs/output.md +++ b/docs/output.md @@ -107,13 +107,13 @@ For other non-default columns (activated under 'Configure Columns'), hover over You will receive output for each supplied FASTQ file. -When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will *normally* be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). +When dealing with ancient DNA data the MultiQC plots for FastQC will often show lots of 'warning' or 'failed' samples. You generally can discard this sort of information as we are dealing with very degraded and metagenomic samples which have artefacts that violate the FastQC 'quality definitions', while still being valid data for aDNA researchers. Instead you will _normally_ be looking for 'global' patterns across all samples of a sequencing run to check for library construction or sequencing failures. Decision on whether a individual sample has 'failed' or not should be made by the user after checking all the plots themselves (e.g. if the sample is consistently an outlier to all others in the run). [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. 
It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. +> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. > :warning: If you turned on `--post_ar_fastq_trimming` your 'post-Trimming' report the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming. #### Sequence Counts @@ -284,7 +284,7 @@ You will receive output for each FASTQ file supplied for single end data, or for These stacked bars plots are unfortunately a little confusing, when displayed in MultiQC. However are relatively straight-forward once you understand each category. They can be displayed as counts of reads per AdapterRemoval read-category, or as percentages of the same values. Each forward(/reverse) file combination are displayed once. -The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar *includes* the other categories displayed (see below) in the calculation. 
+The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar _includes_ the other categories displayed (see below) in the calculation. Other Categories: @@ -323,7 +323,7 @@ With paired-end ancient DNA sequencing runs You expect to see a slight increase This module provides information on mapping when running the Bowtie2 aligner. Bowtie2, like bwa, takes raw FASTQ reads and finds the most likely place on the reference genome it derived from. While this module is somewhat redundant with the [Samtools](#samtools) (which reports mapping statistics for bwa) and the endorSp.y endogenous DNA value in the general statistics table, it does provide some details that could be useful in certain contexts. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Single/Paired-end alignments @@ -343,7 +343,7 @@ The main additional useful information compared to [Samtools](#samtools) is that MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. 
+You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. #### Metagenomic Mappability @@ -378,7 +378,7 @@ Kraken is another metagenomic classifier, but takes a different approach to alig It is useful when you do not have large computing power or you want very rapid but rough approximation of the metagenomic profile of your sample. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. #### Top Taxa @@ -396,7 +396,7 @@ However for screening for specific metagenomic profiles, such as ancient microbi This module provides numbers in raw counts of the mapping of your DNA reads to your reference genome. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Flagstat Plot @@ -416,7 +416,7 @@ The remaining rows will be 0 when running `bwa aln` as these characteristics of ### DeDup -You will receive output for each *library*. 
This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Background @@ -476,7 +476,7 @@ There are two algorithms from the tools we use: `c_curve` and `lc_extrap`. The f Due to endogenous DNA being so low when doing initial screening, the maths behind `lc_extrap` often fails as there is not enough data. Therefore nf-core/eager sticks with `c_curve` which gives a similar approximation of the library complexity, but is more robust to smaller datasets. -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Complexity Curve @@ -506,7 +506,7 @@ Therefore, three main characteristics of ancient DNA are: * Elevated G and As (purines) just before strand breaks * Increased C and Ts at ends of fragments -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each deduplicated _library_. 
This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Misincorporation Plots @@ -547,7 +547,7 @@ Qualimap is a tool which provides statistics on the quality of the mapping of yo Note that many of the statistics from this module are displayed in the General Stats table (see above), as they represent single values that are not plottable. -You will receive output for each *sample*. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). +You will receive output for each _sample_. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). :warning: If your library has no reads mapping to the reference, this will result in an empty BAM file. Qualimap will therefore not produce any output even if a BAM exists! diff --git a/docs/usage.md b/docs/usage.md index 454b10a93..b63b6120a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -531,7 +531,7 @@ and investigate the log and error messages that are produced by each command of the process. For example, in the error in -[1a](#1a-Nextflow-reports-an-error-executing-process-with-command-error) you can +[1a](#1a-nextflow-reports-an-error-executing-process-with-command-error) you can see the following line ```bash @@ -1447,7 +1447,7 @@ signal drop or want to log off, Nextflow will not crash. 
#### Tutorial Human Pop-Gen - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -1703,7 +1703,7 @@ each `Lane`, but the `Sample_Name` and `Library_ID` columns identify and group them together accordingly. Secondly, as we have NextSeq data, we have specified we have `2` for `Colour_Chemistry`, which is important for downstream processing (see below). The other columns are less important for this particular context of -metagenomic screening. See the nf-core/eager [usage](#pipeline-options) +metagenomic screening. See the nf-core/eager [parameters](https://nf-core/eager/parameters) documentation for more specifications on how to set up a TSV file (e.g. why despite NextSeqs only having 4 lanes, we go up to 8 in the example above). @@ -1806,7 +1806,7 @@ nextflow run nf-core/eager \ nf-core/eager will now take all unmapped reads after mapping and convert the BAM file back to FASTQ, which can be accepted by MALT. But of course, we also then need to tell nf-core/eager we actually want to run MALT. We will also specify -the location of the [pre-built database](#preparation) and which 'min support' +the location of the [pre-built database](##tutorial-metagenomics---preparation) and which 'min support' method we want to use (this specifies the minimum number of alignments is needed to a particular taxonomic node to be 'kept' in the MALT output files). Otherwise we will keep all other parameters as default. 
For example using BlastN mode, @@ -1878,7 +1878,7 @@ Porphyromonas ``` We have also specified the path to the HOPS resources [downloaded -earlier](#preparation), and that I want to turn off 'destacking' (removal of any +earlier](#tutorial-metagenomics---preparation), and that I want to turn off 'destacking' (removal of any read that overlaps the positions of another - something only recommended to keep on when you have high coverage data). @@ -1889,7 +1889,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Metagenomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [usage](#pipeline-options) that all parameters are as expected, or check +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -2515,7 +2515,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Pathogen Genomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. From 2ac8389b32b4e5eb56652f6aa80e845c1ecd8e29 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:35:48 +0200 Subject: [PATCH 11/15] Final markdown linting --- docs/output.md | 4 ++-- docs/usage.md | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/output.md b/docs/output.md index fcd345f16..9b6950779 100644 --- a/docs/output.md +++ b/docs/output.md @@ -670,7 +670,7 @@ If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to the s ## Output Files -This section gives a brief summary of where to look for what files for downstream analysis. 
This covers *all* modules. +This section gives a brief summary of where to look for what files for downstream analysis. This covers _all_ modules. Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. @@ -697,7 +697,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. * `metagenomic_classification/`: this contains the output for a given metagenomic classifier. * Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. 
You will find two kraken reports formats available: + * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). _Kmer duplication is defined as: number of kmers / number of unique kmers_. You will find two kraken reports formats available: * the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) * the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. * finally, the `*.kraken.out` file are the direct output of Kraken2 diff --git a/docs/usage.md b/docs/usage.md index b63b6120a..133f986fc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,7 +115,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters -> *Important*: If running nf-core/eager on a cluster - ask your system +> _Important_: If running nf-core/eager on a cluster - ask your system > administrator what profile to use. **Institution Specific Profiles** These are profiles specific to certain **HPC @@ -349,7 +349,7 @@ Note the following important points and limitations for setting up: * The TSV must use actual tabs (not spaces) between cells. * The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. 
-* *File* names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). +* _File_ names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). * At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. * Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. * If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. @@ -586,7 +586,7 @@ the #eager channel). #### Tutorial Profiles - Background -A useful feature of Nextflow is the ability to use configuration *profiles* that +A useful feature of Nextflow is the ability to use configuration _profiles_ that can specify many default parameters and other settings on how to run your pipeline. @@ -617,7 +617,7 @@ levels in terms of memory usage, pipeline-level profiles can also assist in facilitating reproducible science by giving a way for researchers to 'publish' their exact pipeline parameters in way other users can automatically re-run the pipeline with the pipeline parameters used in the original publication but on -their *own* cluster. +their _own_ cluster. 
To illustrate this, lets say we analysed our data on a HPC called 'blue' for which an institutional profile already exists, and for our analysis we defined a @@ -689,7 +689,7 @@ defined in the `cluster` profile. > institutional-level profiles. Otherwise please skip to [Writing your own profile](#tutorial-profiles---writing-your-own-profile) In actuality, a nf-core/eager run already contains many configs and profiles, -and will normally use *multiple* configs profiles in a single run. Multiple +and will normally use _multiple_ configs profiles in a single run. Multiple configuration and profiles files can be used, and each new one selected will inherit all the previous one's parameters, and the parameters in the new one will then overwrite any that have been changed from the original. @@ -727,7 +727,7 @@ nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna < In the background, any parameters in the pipeline's `nextflow.config` (containing default parameters) will be overwritten by the -`old_dna_profile.config`. In addition, the `old_dna` *profile* will overwrite +`old_dna_profile.config`. In addition, the `old_dna` _profile_ will overwrite any parameters set in the config but outside the profile definition of `old_dna_profile.config`. @@ -764,7 +764,7 @@ if your run does not use the parameters you expect. > specifying a custom `.config` file by using `-C` (capital C) instead of `-c` > (which inherits previously specify parameters) -Another thing that is important to note is that if a specific *profile* is +Another thing that is important to note is that if a specific _profile_ is specified in `nextflow run`, this replaces any 'global' parameter that is specified within the config file (but outside a profile) itself - **regardless** of profile order (see above). @@ -1806,7 +1806,7 @@ nextflow run nf-core/eager \ nf-core/eager will now take all unmapped reads after mapping and convert the BAM file back to FASTQ, which can be accepted by MALT. 
But of course, we also then need to tell nf-core/eager we actually want to run MALT. We will also specify -the location of the [pre-built database](##tutorial-metagenomics---preparation) and which 'min support' +the location of the [pre-built database](#tutorial-metagenomics---preparation) and which 'min support' method we want to use (this specifies the minimum number of alignments is needed to a particular taxonomic node to be 'kept' in the MALT output files). Otherwise we will keep all other parameters as default. For example using BlastN mode, From 260d7804af28d0eed3899382684fbc044ebd3dcf Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:38:07 +0200 Subject: [PATCH 12/15] Fix inconssitency with template --- CODE_OF_CONDUCT.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4a82a677..5409a32a5 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -44,7 +44,7 @@ The safety officer in consultation with the nf-core core team have the right and Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply +## When are where does this Code of Conduct apply? Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: @@ -108,4 +108,4 @@ All reports will be handled with utmost discretion and confidentially. ### v1.0 - March 12th, 2021 -- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. +- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. 
\ No newline at end of file From 0d7fbbf57ffae755539c194501ad62a4b55b3609 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 08:39:07 +0200 Subject: [PATCH 13/15] Missing new line --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 5409a32a5..f4fd052f1 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -108,4 +108,4 @@ All reports will be handled with utmost discretion and confidentially. ### v1.0 - March 12th, 2021 -- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. \ No newline at end of file +- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. From 9b9173d88fafb73694913dde17e6cd98fab9c24c Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 10:03:41 +0200 Subject: [PATCH 14/15] Try custom yaml linting --- .github/workflows/linting.yml | 10 +++------- .github/yamllint.yml | 7 +++++++ 2 files changed, 10 insertions(+), 7 deletions(-) create mode 100644 .github/yamllint.yml diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77b4b9d07..8ce31bdcb 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -45,7 +45,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - YAML: runs-on: ubuntu-latest steps: @@ -55,7 +54,7 @@ jobs: - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") --config .github/yamllint.yml # If the above check failed, post a comment on the PR explaining the failure - name: Post PR comment @@ -82,11 +81,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - nf-core: runs-on: ubuntu-latest steps: - - name: Check out pipeline code uses: 
actions/checkout@v2 @@ -99,8 +96,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.6' - architecture: 'x64' + python-version: "3.6" + architecture: "x64" - name: Install dependencies run: | @@ -127,4 +124,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - diff --git a/.github/yamllint.yml b/.github/yamllint.yml new file mode 100644 index 000000000..35ebcb16c --- /dev/null +++ b/.github/yamllint.yml @@ -0,0 +1,7 @@ +rules: + document-start: disable + comments: disable + truthy: disable + line-length: disable + empty-lines: disable + From a81503b6959214146cf8cdfe0269f908f89bb378 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Fri, 29 Jul 2022 10:16:04 +0200 Subject: [PATCH 15/15] fix yaml lint config param --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 8ce31bdcb..47d45298a 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -54,7 +54,7 @@ jobs: - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") --config .github/yamllint.yml + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") -c .github/yamllint.yml # If the above check failed, post a comment on the PR explaining the failure - name: Post PR comment