Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ before_install:
# PRs to master are only ok if coming from dev branch
- '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
# Pull the docker image first so the test doesn't wait for this
- docker pull bioinformant/bactmap:latest
- docker pull nfcore/bactmap:dev
# Fake the tag locally so that the pipeline runs properly
- docker tag bioinformant/bactmap:latest bioinformant/bactmap:dev
- docker tag nfcore/bactmap:dev nfcore/bactmap:dev

install:
# Install Nextflow
Expand All @@ -22,16 +22,16 @@ install:
- sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow
# Install nf-core/tools
- pip install --upgrade pip
- pip install nf-core
- pip install --upgrade --force-reinstall git+https://github.com/nf-core/tools.git@dev

# Reset
- mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests

env:
- NXF_VER='19.05.0' # Specify a minimum NF version that should be tested and work
- NXF_VER='' # Plus: get the latest NF version and check that it works
- NXF_VER='19.05.0-edge' # Specify a minimum NF version that should be tested and work

script:
# Lint the pipeline code
- nf-core lint ${TRAVIS_BUILD_DIR}
# Run the pipeline with the test profile
- nextflow run ${TRAVIS_BUILD_DIR} -profile test
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker
5 changes: 2 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM continuumio/miniconda3
FROM nfcore/base
LABEL authors="Anthony Underwood" \
description="Docker image containing all requirements for nf-core/bactmap pipeline"

Expand All @@ -19,6 +19,5 @@ RUN wget https://download.asperasoft.com/download/sw/connect/3.8.1/ibm-aspera-co
mv /root/.aspera /.aspera; \
echo "[aspera]\nASPERA_BIN = /.aspera/connect/bin/ascp\nASPERA_PRIVATE_KEY = /.aspera/connect/etc/asperaweb_id_dsa.openssh\nASPERA_OPTIONS =\nASPERA_SPEED = 100M" > /aspera.ini

RUN cd /root; git clone https://github.com/enasequence/enaBrowserTools.git; mv enaBrowserTools /enaBrowserTools

ENV PATH /enaBrowserTools/python3:/opt/conda/envs/nf-core-bactmap-1.0dev/bin:$PATH
ENV PATH /opt/conda/envs/nf-core-bactmap-1.0dev/bin:$PATH
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
**A mapping-based pipeline for creating a phylogeny from bacterial whole genome sequences**

[![Build Status](https://travis-ci.org/nf-core/bactmap.svg?branch=master)](https://travis-ci.org/nf-core/bactmap)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.05.0-brightgreen.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.05.0--edge-brightgreen.svg)](https://www.nextflow.io/)

[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/)
[![Docker](https://img.shields.io/docker/automated/nfcore/bactmap.svg)](https://hub.docker.com/r/bioinformant/bactmap)
Expand Down
25 changes: 23 additions & 2 deletions bin/scrape_software_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@
'Nextflow': ['v_nextflow.txt', r"(\S+)"],
'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"],
'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"],
'BWA': ['v_bwa.txt', r"Version: (\S+)"],
'SAMtools ': ['v_samtools.txt', r"Version: (\S+)"],
'BCFtools': ['v_bcftools.txt', r"Version: (\S+)"],
'PySam': ['v_pysam.txt', r"(\S+)"],
'Trimmomatic': ['v_trimmomatic.txt', r"(\S+)"],
'Mash': ['v_mash.txt', r"Mash version (\S+)"],
'Seqtk': ['v_seqtk.txt', r"Version: (\S+)"],
'IQtree': ['v_iqtree.txt', r"version (\S+)"],
'SNP-sites': ['v_snp-sites.txt', r"snp-sites: (\S+)"],
'Gubbins': ['v_gubbins.txt', r"Version: (\S+)"],
'FastTree': ['v_fasttree.txt', r"(version \S+)"]
}
results = OrderedDict()
results['nf-core/bactmap'] = '<span style="color:#999999;\">N/A</span>'
Expand All @@ -24,9 +35,14 @@
if match:
results[k] = "v{}".format(match.group(1))

# Remove software set to false in results
for k in results:
if not results[k]:
del(results[k])

# Dump to YAML
print ('''
id: 'nf-core/bactmap-software-versions'
id: 'software_versions'
section_name: 'nf-core/bactmap Software Versions'
section_href: 'https://github.com/nf-core/bactmap'
plot_type: 'html'
Expand All @@ -35,5 +51,10 @@
<dl class="dl-horizontal">
''')
for k,v in results.items():
print(" <dt>{}</dt><dd>{}</dd>".format(k,v))
print(" <dt>{}</dt><dd><samp>{}</samp></dd>".format(k,v))
print (" </dl>")

# Write out regexes as csv file:
with open('software_versions.csv', 'w') as f:
for k,v in results.items():
f.write("{}\t{}\n".format(k,v))
13 changes: 6 additions & 7 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,21 @@
* run on the logged in environment.
*/

docker.enabled = true
process.container = 'bioinformant/bactmap:latest'

process {

cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 2.GB, 'memory' ) }
time = '4hr'
time = 24.h

errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'finish' }
errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'
}

params {
// Defaults only, expecting to be overwritten
max_memory = 64.GB
max_memory = 8.GB
max_cpus = 8
max_time = 240.h
memory_for_tree_building = 15.GB
cpus_for_tree_building = 8
}
2 changes: 2 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ params {
max_cpus = 2
max_memory = 6.GB
max_time = 48.h
memory_for_tree_building = 4.GB
cpus_for_tree_building = 2
// Input data
read_paths = [
[
Expand Down
89 changes: 66 additions & 23 deletions docs/output.md
Original file line number Diff line number Diff line change
@@ -1,41 +1,84 @@
# nf-core/bactmap: Output

This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.

<!-- TODO nf-core: Write this documentation describing your workflow's output -->
This document describes the output produced by the pipeline.

## Pipeline overview
The pipeline is built using [Nextflow](https://www.nextflow.io/)
and processes data using the following steps:

* [FastQC](#fastqc) - read quality control
* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline

## FastQC
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences.
* [Fetch from ENA](#fetch-from-ena) (Optional) Fetch reads from the ENA
* [Trim Reads](#trim-reads) Read trimming using Trimmomatic
* [Estimate genome size](#genome-size-estimation)
* [Downsample reads](#downsample-reads)
* [Map reads](#map-reads)
* [Call variants](#call-variants)
* [Filter variants](#filter-variants)
* [Pseudogenome creation](#pseudogenome-creation)
* [Pseudogenome alignment creation](#pseudogenome-alignment-creation)
* [Recombination removal](#recombination-removal) (Optional)
* [Invariant site removal](#invariant-site-removal)
* [Phylogenetic tree creation](#phylogenetic-tree-creation) (Optional)



## Fetch from ENA
This process will fetch reads from the ENA archive using the `enaDataGet` tool from [ENA Browser Tools](https://github.com/enasequence/enaBrowserTools)

## Trim Reads
Trim with [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) reads based on the parameters ILLUMINACLIP:adapter_file.fas:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 and with MIN_LEN dynamically determined based on 30% of the read length
**Output directory: `<OUTPUT DIR>/trimmed_fastqs`**
Fastq files post trimming will be written here

## Genome Size Estimation
Estimate the size of the genome using [Mash](https://mash.readthedocs.io/en/latest/)

## Downsample reads
If the `--depth_cutoff` parameter is specified then reads will be downsampled using [seqtk](https://github.com/lh3/seqtk) to the specified depth

For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).

> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory.
## Map reads
The reads will be mapped to the specified reference genome using [bwa mem](https://github.com/lh3/bwa)
**Output directory: `<OUTPUT DIR>/sorted_bams`**
Sorted bam files will be written here

**Output directory: `results/fastqc`**
## Call variants
Variants will be called using [samtools](http://www.htslib.org/doc/samtools.html)

* `sample_fastqc.html`
* FastQC report, containing quality metrics for your untrimmed raw fastq files
* `zips/sample_fastqc.zip`
* zip file containing the FastQC report, tab-delimited data file and plot images
## Filter variants
Variants will be filtered using [bcftools](http://www.htslib.org/doc/bcftools.html) in order to flag low quality SNPs using the default filter of `%QUAL<25 || FORMAT/DP<10 || MAX(FORMAT/ADF)<5 || MAX(FORMAT/ADR)<5 || MAX(FORMAT/AD)/SUM(FORMAT/DP)<0.9 || MQ<30 || MQ0F>0.1`
**Output directory: `<OUTPUT DIR>/filtered_bcfs`**
Filtered vcf files will be written here
## Pseudogenome creation
A pseudogenome based on the variants called is created where missing positions are encoded as `-` characters and low quality positions as `N`. All other positions either match the reference or are encoded as a SNV of either G,A,T or C. The script [filtered_bcf_to_fasta.py](bin/filtered_bcf_to_fasta.py) is used.
**Output directory: `<OUTPUT DIR>/pseudogenomes`**
A pseudogenome for each sample will be written here

## Pseudogenome alignment creation
The pseudogenomes from the previous step are concatenated to make a whole genome alignment
**Output directory: `<OUTPUT DIR>/pseudogenomes`**
The multi-sample pseudogenome alignment will be written here

## MultiQC
[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory.
## Recombination removal
Recombination is removed from the alignment using [gubbins](https://github.com/sanger-pathogens/gubbins/)

The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability.
## Invariant sites
Invariant sites are removed using [snp-sites](https://github.com/sanger-pathogens/snp-sites)

**Output directory: `results/multiqc`**
## Phylogenetic tree creation
A Maximum likelihood tree is generated using [IQ-TREE](http://www.iqtree.org)
**Output directory: `<OUTPUT DIR>`**
The consensus tree `aligned_pseudogenome.variants_only.contree` including bootstrap values will be written here

* `Project_multiqc_report.html`
* MultiQC report - a standalone HTML file that can be viewed in your web browser
* `Project_multiqc_data/`
* Directory containing parsed statistics from the different tools used in the pipeline
# Software used within the pipeline
- [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) A flexible read trimming tool for Illumina NGS data.
- [mash](https://mash.readthedocs.io/en/latest/) Fast genome and metagenome distance estimation using MinHash.
- [seqtk](https://github.com/lh3/seqtk) A fast and lightweight tool for processing sequences in the FASTA or FASTQ format.
- [bwa mem](https://github.com/lh3/bwa) Burrow-Wheeler Aligner for short-read alignment
- [samtools](http://www.htslib.org/doc/samtools.html) Utilities for the Sequence Alignment/Map (SAM) format
- [bcftools](http://www.htslib.org/doc/bcftools.html) Utilities for variant calling and manipulating VCFs and BCFs
- [filtered_bcf_to_fasta.py](bin/filtered_bcf_to_fasta.py) Python utility to create a pseudogenome from a bcf file where each position in the reference genome is included
- [gubbins](https://github.com/sanger-pathogens/gubbins/) Rapid phylogenetic analysis of large samples of recombinant bacterial whole genome sequences
- [snp-sites](https://github.com/sanger-pathogens/snp-sites) Finds SNP sites from a multi-FASTA alignment file
- [IQ-TREE](http://www.iqtree.org) Efficient software for phylogenomic inference

For more information about how to use MultiQC reports, see http://multiqc.info
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ dependencies:
- iqtree=1.6.8
- snp-sites=2.4.1
- gubbins=2.3.4
- fasttree=2.1.10
- fasttree=2.1.10
- enabrowsertools=1.5.4
8 changes: 4 additions & 4 deletions lib/bactmap_processes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ process create_pseudogenome_alignment{
// remove non-informative positions with snp-sites

process create_variant_only_alignment{
memory '15GB'
cpus 4
memory { params.memory_for_tree_building * task.attempt }
cpus params.cpus_for_tree_building

tag { 'create variant only pseudogenome alignment' }

Expand Down Expand Up @@ -261,8 +261,8 @@ process create_variant_only_alignment{

// Build ML tree
process build_tree {
memory { 15.GB * task.attempt }
cpus 4
memory { params.memory_for_tree_building * task.attempt }
cpus params.cpus_for_tree_building

tag {'build tree'}
publishDir "${params.outdir}",
Expand Down
Loading