1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
/datasets/
test_data
/conda/envs
/conf/aws.config
.nextflow*
116 changes: 58 additions & 58 deletions main.nf
100755 → 100644

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions modules/ariba/ariba_analysis/README.md
@@ -0,0 +1,16 @@
# ariba_analysis process testing:

This process runs reads against all available (if any) ARIBA datasets.

## About testing this process:

With DSL2, each module can be tested separately using a test workflow inside the process `.nf` file. Testing requires three items:
- the local files in `test_data`
- params in `test_params.yaml`
- the `test` profile in `nextflow.config`

## How to test it:

    $ nextflow run ariba_analysis.nf -params-file test_params.yaml -profile test,docker -entry test

If you've used `bactopia conda activate`, you can also swap `docker` for `conda` to test with Conda.
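
For example, the Conda variant of the same module test looks like this (a minimal sketch; it assumes the Conda environments were already set up via `bactopia conda activate`, as noted above):

```bash
# Same module test, but resolving dependencies through Conda instead of Docker
nextflow run ariba_analysis.nf -params-file test_params.yaml -profile test,conda -entry test
```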
51 changes: 51 additions & 0 deletions modules/ariba/ariba_analysis/ariba_analysis.nf
@@ -0,0 +1,51 @@
nextflow.enable.dsl = 2

process ARIBA_ANALYSIS {
    /* Run reads against all available (if any) ARIBA datasets */
    tag "${sample} - ${dataset_name}"

    publishDir "${outdir}/${sample}/logs", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${task.process}/*"
    publishDir "${outdir}/${sample}/ariba", mode: "${params.publish_mode}", overwrite: params.overwrite, pattern: "${dataset_name}/*"

    input:
    tuple val(sample), val(single_end), path(fq)
    each path(dataset)

    output:
    file "${dataset_name}/*"
    file "${task.process}/*" optional true

    when:
    single_end == false && ARIBA_DATABASES.isEmpty() == false

    shell:
    dataset_tarball = dataset.getName()
    dataset_name = dataset_tarball.replace('.tar.gz', '')
    spades_options = params.spades_options ? "--spades_options '${params.spades_options}'" : ""
    noclean = params.ariba_no_clean ? "--noclean" : ""
    template "ariba_analysis.sh"

    stub:
    dataset_tarball = dataset.getName()
    dataset_name = dataset_tarball.replace('.tar.gz', '')
    """
    mkdir ${dataset_name}
    mkdir ${task.process}
    touch ${dataset_name}/${sample}
    touch ${task.process}/${sample}
    """
}

//###############
// Module testing
//###############

workflow test {
    TEST_PARAMS_CH = Channel.of([
        params.sample,
        params.single_end,
        file(params.fq)
    ])
    TEST_PARAMS_CH2 = Channel.of(file(params.card), file(params.vfdb))
    ARIBA_ANALYSIS(TEST_PARAMS_CH, TEST_PARAMS_CH2.collect())
}
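
Because the process declares a `stub:` block, the module test can also be dry-run without ARIBA or its datasets being executed. A hedged sketch, assuming a Nextflow release with stub support:

```bash
# -stub-run replaces the ariba_analysis.sh template with the stub's
# mkdir/touch placeholders, exercising only channel wiring and publishDir logic
nextflow run ariba_analysis.nf -params-file test_params.yaml -profile test,docker -entry test -stub-run
```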
95 changes: 95 additions & 0 deletions modules/ariba/ariba_analysis/bin/build-containers.sh
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# build-containers
#
# Automate the building of Bactopia related containers
VERSION=1.6.0
CONTAINER_VERSION="${VERSION%.*}.x"

function singularity_build {
    recipe=$1
    name=$2
    image=$3
    version=$4
    latest=${5:-0}

    echo "Working on ${recipe}"
    singularity build -F ${image} ${recipe}
    singularity sign ${image}
    singularity push ${image} library://rpetit3/bactopia/${name}:${version}

    if [[ "${latest}" == "1" ]]; then
        singularity push ${image} library://rpetit3/bactopia/${name}:latest
    fi
}

function docker_build {
    recipe=$1
    image=$2
    latest=${3:-0}

    echo "Working on ${recipe}"
    docker build --rm -t ${image} -f ${recipe} .
    docker push ${image}

    if [[ "${latest}" != "0" ]]; then
        docker tag ${image} ${latest}
        docker push ${latest}
    fi
}


if [[ $# == 0 ]]; then
    echo ""
    echo "build-containers.sh BACTOPIA_DIR OUTPUT_DIR"
    echo ""
    echo "Example Command"
    echo "build-containers.sh /home/bactopia/bactopia container-images/"
    echo ""
    exit
fi

BACTOPIA_DIR=$1
OUTPUT_DIR=${2:-"./"}
if [ -z "${BACTOPIA_DIR}" ]; then
    echo "Got ${#} arguments"
    echo "Must give the path to the Bactopia repository"
    exit 1
fi
MAJOR_VERSION=${3:-"0"}

mkdir -p ${OUTPUT_DIR}

# Build Bactopia containers
#singularity_build Singularity bactopia ${OUTPUT_DIR}/bactopia-${VERSION}.simg ${VERSION} 1
#docker_build Dockerfile bactopia/bactopia:${VERSION} bactopia/bactopia:latest

if [ "${MAJOR_VERSION}" == "1" ]; then
# Build Singularity
for recipe in $(ls "${BACTOPIA_DIR}/containers/singularity" | grep ".Singularity"); do
recipe_path="${BACTOPIA_DIR}/containers/singularity/${recipe}"
recipe_name=$(echo ${recipe} | sed 's/.Singularity//')
recipe_image="${OUTPUT_DIR}/${recipe_name}-${CONTAINER_VERSION}.simg"
singularity_build ${recipe_path} ${recipe_name} ${recipe_image} ${CONTAINER_VERSION}
done

# Build Docker
docker_build Dockerfile bactopia/bactopia:${CONTAINER_VERSION} bactopia/bactopia:latest
for recipe in $(ls "${BACTOPIA_DIR}/containers/docker" | grep ".Dockerfile"); do
recipe_path="${BACTOPIA_DIR}/containers/docker/${recipe}"
recipe_name=$(echo ${recipe} | sed 's/.Dockerfile//')
recipe_image="bactopia/${recipe_name}:${CONTAINER_VERSION}"
#docker_build ${recipe_path} ${recipe_image}
done

# Build Bactopia Tools containers
for tool in $(ls "${BACTOPIA_DIR}/tools"); do
recipe_path="${BACTOPIA_DIR}/tools/${tool}"
docker_file="${recipe_path}/Dockerfile"
docker_image="bactopia/tools-${tool}:${CONTAINER_VERSION}"
#docker_build ${docker_file} ${docker_image}

singularity_file="${recipe_path}/Singularity"
singularity_image="${OUTPUT_DIR}/tools-${tool}-${CONTAINER_VERSION}.simg"
singularity_build ${singularity_file} "tools-${tool}" ${singularity_image} ${CONTAINER_VERSION}
done
fi
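
One subtlety worth noting: the usage message above lists only `BACTOPIA_DIR` and `OUTPUT_DIR`, but the build loops are gated behind a third positional argument, `MAJOR_VERSION`, which defaults to `"0"`. A sketch of an invocation that actually triggers the builds:

```bash
# The trailing "1" sets MAJOR_VERSION so the Singularity/Docker loops run;
# without it the script only creates OUTPUT_DIR and exits
./build-containers.sh /home/bactopia/bactopia container-images/ 1
```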
79 changes: 79 additions & 0 deletions modules/ariba/ariba_analysis/bin/check-assembly-accession.py
@@ -0,0 +1,79 @@
#! /usr/bin/env python3
"""
Verifies that an NCBI Assembly accession is the latest version and is still available.
"""
PROGRAM = "check-assembly-accession"
VERSION = "1.6.0"


def check_assembly_version(accession):
    from Bio import Entrez
    import time
    Entrez.email = "[email protected]"
    Entrez.tool = "BactopiaCheckAssemblyAccession"

    handle = Entrez.esearch(db="assembly", term=accession, retmax="500")
    record = Entrez.read(handle, validate=False)
    time.sleep(1)  # Be kind to NCBI

    if len(record["IdList"]):
        handle = Entrez.esummary(db="assembly", id=",".join(record["IdList"]))
        record = Entrez.read(handle, validate=False)
        time.sleep(1)  # Be kind to NCBI

        records = []
        excluded = set()
        for assembly in record['DocumentSummarySet']["DocumentSummary"]:
            if assembly["ExclFromRefSeq"]:
                # PGAP can cause some assemblies to eventually become excluded from RefSeq
                # https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/
                for reason in assembly["ExclFromRefSeq"]:
                    excluded.add(reason)
            else:
                records.append(assembly["AssemblyAccession"])

        if excluded:
            return [','.join(list(excluded)), True]
        else:
            return [sorted(records, reverse=True)[0], False]
    else:
        return [f'No records found for {accession}', True]


if __name__ == '__main__':
    import argparse as ap
    import sys
    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Verifies NCBI Assembly accession is latest and still available'
        )
    )

    parser.add_argument(
        'reference', metavar="STR", type=str,
        help='NCBI Assembly accession to be tested.'
    )
    parser.add_argument('--version', action='version',
                        version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()
    reference = args.reference.split('.')[0]
    current_accession, excluded = check_assembly_version(reference)
    if excluded:
        print(
            f'Skipping {reference}. Reason: {current_accession}',
            file=sys.stderr
        )
    else:
        print(f'Using {current_accession} for {args.reference}', file=sys.stderr)
        print(current_accession)
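
A hedged usage sketch (the accession below is only an example): the script strips the version suffix before querying NCBI, reports progress on stderr, and prints the latest accession on stdout only when the assembly has not been excluded from RefSeq:

```bash
# Example accession is hypothetical; any GCF_/GCA_ assembly accession works.
# stderr: a "Using ..." or "Skipping ..." progress message
# stdout: the latest, non-excluded accession for downstream use
./check-assembly-accession.py GCF_000008865.2
```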
109 changes: 109 additions & 0 deletions modules/ariba/ariba_analysis/bin/check-fastqs.py
@@ -0,0 +1,109 @@
#! /usr/bin/env python3
"""
Sometimes with AWS, files might fail to download but not cause an error.
This script checks to verify all expected inputs are staged.
"""
PROGRAM = "check-staging"
VERSION = "1.6.0"
import sys


def read_json(json_file):
    import json
    with open(json_file, 'rt') as json_fh:
        return json.load(json_fh)


def write_error(filename, error_msg):
    print(error_msg, file=sys.stderr)
    with open(filename, "wt") as fh_out:
        fh_out.write(error_msg)
    return 1


def check_reads(fq1, sample, min_reads, fq2=None):
    error = 0
    total_reads = fq1 + fq2 if fq2 else fq1

    if total_reads < min_reads:
        error_msg = (f"{sample} FASTQ(s) contain {total_reads} total reads. This does not\n"
                     f"exceed the required minimum of {min_reads} reads. Further analysis is\n"
                     "discontinued.\n")
        error += write_error(f'{sample}-low-read-count-error.txt', error_msg)

    if fq2:
        if fq1 != fq2:
            # Different number of reads in the pair
            error_msg = (f"{sample} FASTQs have different read counts (R1: {fq1}, R2: {fq2}). Please\n"
                         "investigate these FASTQs. Further analysis is discontinued.\n")
            error += write_error(f'{sample}-different-read-count-error.txt', error_msg)

    return error


def check_basepairs(fq1, sample, min_basepairs, fq2=None, min_proportion=None):
    error = 0
    total_bp = fq1 + fq2 if fq2 else fq1

    if total_bp < min_basepairs:
        error_msg = (f"{sample} FASTQ(s) contain {total_bp} total basepairs. This does not\n"
                     f"exceed the required minimum of {min_basepairs} bp. Further analysis is\n"
                     "discontinued.\n")
        error += write_error(f'{sample}-low-sequence-depth-error.txt', error_msg)

    if fq2:
        proportion = float(fq1) / float(fq2) if fq1 < fq2 else float(fq2) / float(fq1)
        if proportion < min_proportion:
            # One FASTQ has disproportionately more basepairs than the other
            error_msg = (f"{sample} FASTQs failed to meet the minimum shared basepair proportion ({min_proportion}).\n"
                         f"The observed proportion was {proportion:.4f}, with R1 having {fq1} bp and\n"
                         f"R2 having {fq2} bp. Further analysis is discontinued.\n")
            error += write_error(f'{sample}-low-basepair-proportion-error.txt', error_msg)

    return error


if __name__ == '__main__':
    import argparse as ap
    parser = ap.ArgumentParser(
        prog=PROGRAM,
        conflict_handler='resolve',
        description=(
            f'{PROGRAM} (v{VERSION}) - Verifies inputs for a process are available.'
        )
    )

    parser.add_argument('--sample', metavar="STR", type=str, help='Name of the input sample.')
    parser.add_argument('--fq1', metavar="STR", type=str, help='Stats for SE or R1 FASTQ in JSON format.')
    parser.add_argument('--fq2', metavar="STR", type=str, help='Stats for R2 FASTQ in JSON format.')
    parser.add_argument('--min_proportion', metavar="FLOAT", type=float,
                        help='Minimum proportion of sequenced basepairs that R1 and R2 must share.')
    parser.add_argument('--min_reads', metavar="INT", type=int, help='Minimum number of reads.')
    parser.add_argument('--min_basepairs', metavar="INT", type=int, help='Minimum number of sequenced basepairs.')
    parser.add_argument('--version', action='version', version=f'{PROGRAM} {VERSION}')

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()

    error = 0
    if args.fq1 and args.fq2:
        # Paired-end
        r1 = read_json(args.fq1)
        r2 = read_json(args.fq2)
        error += check_reads(r1["qc_stats"]["read_total"], args.sample, args.min_reads,
                             fq2=r2["qc_stats"]["read_total"])
        error += check_basepairs(r1["qc_stats"]["total_bp"], args.sample, args.min_basepairs,
                                 fq2=r2["qc_stats"]["total_bp"], min_proportion=args.min_proportion)
    else:
        # Single-end
        se = read_json(args.fq1)
        error += check_reads(se["qc_stats"]["read_total"], args.sample, args.min_reads)
        error += check_basepairs(se["qc_stats"]["total_bp"], args.sample, args.min_basepairs)

    sys.exit(error)
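
To close, a hedged example of how this script might be invoked for a paired-end sample; the file names and threshold values are made up, but each JSON stats file is expected to carry the `qc_stats.read_total` and `qc_stats.total_bp` fields read above:

```bash
# Hypothetical paired-end check: r1.json and r2.json each look like
#   {"qc_stats": {"read_total": 150000, "total_bp": 22500000}}
# A non-zero exit status means one or more *-error.txt files were written.
./check-fastqs.py --sample my_sample \
    --fq1 r1.json --fq2 r2.json \
    --min_reads 7472 --min_basepairs 2241820 --min_proportion 0.5
```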