Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ before_install:
# PRs to master are only ok if coming from dev branch
- '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
# Pull the docker image first so the test doesn't wait for this
- docker pull bioinformant/bactmap:latest
- docker pull nfcore/bactmap:dev
# Fake the tag locally so that the pipeline runs properly
- docker tag bioinformant/bactmap:latest bioinformant/bactmap:dev
- docker tag nfcore/bactmap:dev nfcore/bactmap:dev

install:
# Install Nextflow
Expand All @@ -22,16 +22,16 @@ install:
- sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow
# Install nf-core/tools
- pip install --upgrade pip
- pip install nf-core
- pip install --upgrade --force-reinstall git+https://github.com/nf-core/tools.git@dev

# Reset
- mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests

env:
- NXF_VER='19.05.0' # Specify a minimum NF version that should be tested and work
- NXF_VER='' # Plus: get the latest NF version and check that it works
- NXF_VER='19.05.0-edge' # Specify a minimum NF version that should be tested and work

script:
# Lint the pipeline code
- nf-core lint ${TRAVIS_BUILD_DIR}
# Run the pipeline with the test profile
- nextflow run ${TRAVIS_BUILD_DIR} -profile test
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker
5 changes: 2 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM continuumio/miniconda3
FROM nfcore/base
LABEL authors="Anthony Underwood" \
description="Docker image containing all requirements for nf-core/bactmap pipeline"

Expand All @@ -19,6 +19,5 @@ RUN wget https://download.asperasoft.com/download/sw/connect/3.8.1/ibm-aspera-co
mv /root/.aspera /.aspera; \
echo "[aspera]\nASPERA_BIN = /.aspera/connect/bin/ascp\nASPERA_PRIVATE_KEY = /.aspera/connect/etc/asperaweb_id_dsa.openssh\nASPERA_OPTIONS =\nASPERA_SPEED = 100M" > /aspera.ini

RUN cd /root; git clone https://github.com/enasequence/enaBrowserTools.git; mv enaBrowserTools /enaBrowserTools

ENV PATH /enaBrowserTools/python3:/opt/conda/envs/nf-core-bactmap-1.0dev/bin:$PATH
ENV PATH /opt/conda/envs/nf-core-bactmap-1.0dev/bin:$PATH
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
**A mapping-based pipeline for creating a phylogeny from bacterial whole genome sequences**

[![Build Status](https://travis-ci.org/nf-core/bactmap.svg?branch=master)](https://travis-ci.org/nf-core/bactmap)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.05.0-brightgreen.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.05.0--edge-brightgreen.svg)](https://www.nextflow.io/)

[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/)
[![Docker](https://img.shields.io/docker/automated/nfcore/bactmap.svg)](https://hub.docker.com/r/bioinformant/bactmap)
Expand Down
25 changes: 23 additions & 2 deletions bin/scrape_software_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@
'Nextflow': ['v_nextflow.txt', r"(\S+)"],
'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"],
'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"],
'BWA': ['v_bwa.txt', r"Version: (\S+)"],
'SAMtools ': ['v_samtools.txt', r"Version: (\S+)"],
'BCFtools': ['v_bcftools.txt', r"Version: (\S+)"],
'PySam': ['v_pysam.txt', r"(\S+)"],
'Trimmomatic': ['v_trimmomatic.txt', r"(\S+)"],
'Mash': ['v_mash.txt', r"Mash version (\S+)"],
'Seqtk': ['v_seqtk.txt', r"Version: (\S+)"],
'IQtree': ['v_iqtree.txt', r"version (\S+)"],
'SNP-sites': ['v_snp-sites.txt', r"snp-sites: (\S+)"],
'Gubbins': ['v_gubbins.txt', r"Version: (\S+)"],
'FastTree': ['v_fasttree.txt', r"(version \S+)"]
}
results = OrderedDict()
results['nf-core/bactmap'] = '<span style="color:#999999;\">N/A</span>'
Expand All @@ -24,9 +35,14 @@
if match:
results[k] = "v{}".format(match.group(1))

# Remove software set to false in results
for k in results:
if not results[k]:
del(results[k])

# Dump to YAML
print ('''
id: 'nf-core/bactmap-software-versions'
id: 'software_versions'
section_name: 'nf-core/bactmap Software Versions'
section_href: 'https://github.com/nf-core/bactmap'
plot_type: 'html'
Expand All @@ -35,5 +51,10 @@
<dl class="dl-horizontal">
''')
for k,v in results.items():
print(" <dt>{}</dt><dd>{}</dd>".format(k,v))
print(" <dt>{}</dt><dd><samp>{}</samp></dd>".format(k,v))
print (" </dl>")

# Write out regexes as csv file:
with open('software_versions.csv', 'w') as f:
for k,v in results.items():
f.write("{}\t{}\n".format(k,v))
13 changes: 6 additions & 7 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,21 @@
* run on the logged in environment.
*/

docker.enabled = true
process.container = 'bioinformant/bactmap:latest'

process {

cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 2.GB, 'memory' ) }
time = '4hr'
time = 24.h

errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'finish' }
errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'
}

params {
// Defaults only, expecting to be overwritten
max_memory = 64.GB
max_memory = 8.GB
max_cpus = 8
max_time = 240.h
memory_for_tree_building = 15.GB
cpus_for_tree_building = 8
}
2 changes: 2 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ params {
max_cpus = 2
max_memory = 6.GB
max_time = 48.h
memory_for_tree_building = 4.GB
cpus_for_tree_building = 2
// Input data
read_paths = [
[
Expand Down
89 changes: 66 additions & 23 deletions docs/output.md
Original file line number Diff line number Diff line change
@@ -1,41 +1,84 @@
# nf-core/bactmap: Output

This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.

<!-- TODO nf-core: Write this documentation describing your workflow's output -->
This document describes the output produced by the pipeline.

## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/)
and processes data using the following steps:

* [FastQC](#fastqc) - read quality control
* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline

## FastQC
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences.
* [Fetch from ENA](#fetch-from-ena) (Optional) Fetch reads from the ENA
* [Trim Reads](#trim-reads) Read trimming using Trimmomatic
* [Estimate genome size](#genome-size-estimation)
* [Downsample reads](#downsample-reads)
* [Map reads](#map-reads)
* [Call variants](#call-variants)
* [Filter variants](#filter-variants)
* [Pseudogenome creation](#pseudogenome-creation)
* [Pseudogenome alignment creation](#pseudogenome-alignment-creation)
* [Recombination removal](#recombination-removal) (Optional)
* [Invariant site removal](#invariant-site-removal)
* [Phylogenetic tree creation](#phylogenetic-tree-creation) (Optional)



## Fetch from ENA
This process will fetch reads from the ENA archive using the `enaDataGet` tool from [ENA Browser Tools](https://github.com/enasequence/enaBrowserTools)

## Trim Reads
Trim with [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) reads based on the parameters ILLUMINACLIP:adapter_file.fas:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 and with MIN_LEN dynamically determined based on 30% of the read length
**Output directory: `<OUTPUT DIR>/trimmed_fastqs`**
Fastq files post trimming will be written here

## Genome Size Estimation
Estimate the size of the genome using [Mash](https://mash.readthedocs.io/en/latest/)

## Downsample reads
If the `--depth_cutoff` parameter is specified then reads will be downsampled using [seqtk](https://github.com/lh3/seqtk) to the specified depth

For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).

> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory.
## Map reads
The reads will be mapped to the specified reference genome using [bwa mem](https://github.com/lh3/bwa)
**Output directory: `<OUTPUT DIR>/sorted_bams`**
Sorted bam files will be written here

**Output directory: `results/fastqc`**
## Call variants
Variants will be called using [samtools](http://www.htslib.org/doc/samtools.html)

* `sample_fastqc.html`
* FastQC report, containing quality metrics for your untrimmed raw fastq files
* `zips/sample_fastqc.zip`
* zip file containing the FastQC report, tab-delimited data file and plot images
## Filter variants
Variants will be filtered using [bcftools](http://www.htslib.org/doc/bcftools.html) in order to flag low quality SNPs using the default filter of `%QUAL<25 || FORMAT/DP<10 || MAX(FORMAT/ADF)<5 || MAX(FORMAT/ADR)<5 || MAX(FORMAT/AD)/SUM(FORMAT/DP)<0.9 || MQ<30 || MQ0F>0.1`
**Output directory: `<OUTPUT DIR>/filtered_bcfs`**
Filtered vcf files will be written here
## Pseudogenome creation
A pseudogenome based on the variants called is created where missing positions are encoded as `-` characters and low quality positions as `N`. All other positions either match the reference or are encoded as a SNV of either G,A,T or C. The script [filtered_bcf_to_fasta.py](bin/filtered_bcf_to_fasta.py) is used.
**Output directory: `<OUTPUT DIR>/pseudogenomes`**
A pseudogenome for each sample will be written here

## Pseudogenome alignment creation
The pseudogenomes from the previous step are concatenated to make a whole genome alignment
**Output directory: `<OUTPUT DIR>/pseudogenomes`**
The multi-sample pseudogenome alignment will be written here

## MultiQC
[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory.
## Recombination removal
Recombination is removed from the alignment using [gubbins](https://github.com/sanger-pathogens/gubbins/)

The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability.
## Invariant sites
Invariant sites are removed using [snp-sites](https://github.com/sanger-pathogens/snp-sites)

**Output directory: `results/multiqc`**
## Phylogenetic tree creation
A Maximum likelihood tree is generated using [IQ-TREE](http://www.iqtree.org)
**Output directory: `<OUTPUT DIR>`**
The consensus tree `aligned_pseudogenome.variants_only.contree` including bootstrap values will be written here

* `Project_multiqc_report.html`
* MultiQC report - a standalone HTML file that can be viewed in your web browser
* `Project_multiqc_data/`
* Directory containing parsed statistics from the different tools used in the pipeline
# Software used within the pipeline
- [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) A flexible read trimming tool for Illumina NGS data.
- [mash](https://mash.readthedocs.io/en/latest/) Fast genome and metagenome distance estimation using MinHash.
- [seqtk](https://github.com/lh3/seqtk) A fast and lightweight tool for processing sequences in the FASTA or FASTQ format.
- [bwa mem](https://github.com/lh3/bwa) Burrow-Wheeler Aligner for short-read alignment
- [samtools](http://www.htslib.org/doc/samtools.html) Utilities for the Sequence Alignment/Map (SAM) format
- [bcftools](http://www.htslib.org/doc/bcftools.html) Utilities for variant calling and manipulating VCFs and BCFs
- [filtered_bcf_to_fasta.py](bin/filtered_bcf_to_fasta.py) Python utility to create a pseudogenome from a bcf file where each position in the reference genome is included
- [gubbins](https://github.com/sanger-pathogens/gubbins/) Rapid phylogenetic analysis of large samples of recombinant bacterial whole genome sequences
- [snp-sites](https://github.com/sanger-pathogens/snp-sites) Finds SNP sites from a multi-FASTA alignment file
- [IQ-TREE](http://www.iqtree.org) Efficient software for phylogenomic inference

For more information about how to use MultiQC reports, see http://multiqc.info
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ dependencies:
- iqtree=1.6.8
- snp-sites=2.4.1
- gubbins=2.3.4
- fasttree=2.1.10
- fasttree=2.1.10
- enabrowsertools=1.5.4
8 changes: 4 additions & 4 deletions lib/bactmap_processes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ process create_pseudogenome_alignment{
// remove non-informative positions with snp-sites

process create_variant_only_alignment{
memory '15GB'
cpus 4
memory { params.memory_for_tree_building * task.attempt }
cpus params.cpus_for_tree_building

tag { 'create variant only pseudogenome alignment' }

Expand Down Expand Up @@ -261,8 +261,8 @@ process create_variant_only_alignment{

// Build ML tree
process build_tree {
memory { 15.GB * task.attempt }
cpus 4
memory { params.memory_for_tree_building * task.attempt }
cpus params.cpus_for_tree_building

tag {'build tree'}
publishDir "${params.outdir}",
Expand Down
Loading